From b3caceb99ed3eab1d55bba723426c48b499b4825 Mon Sep 17 00:00:00 2001 From: Dennis van der Schagt Date: Mon, 22 Jul 2024 13:54:48 +0200 Subject: [PATCH] Implement parsing of Content-Type charset parameter --- include/charencoding.h | 2 + rust/libnewsboat-ffi/src/charencoding.rs | 14 +++ rust/libnewsboat/src/charencoding.rs | 116 ++++++++++++++++++++++- src/charencoding.cpp | 11 +++ test/charencoding.cpp | 23 +++++ 5 files changed, 165 insertions(+), 1 deletion(-) diff --git a/include/charencoding.h b/include/charencoding.h index 5333dc3be..95a85fc25 100644 --- a/include/charencoding.h +++ b/include/charencoding.h @@ -13,6 +13,8 @@ namespace charencoding { nonstd::optional charset_from_bom(std::vector content); nonstd::optional charset_from_xml_declaration(std::vector content); +nonstd::optional charset_from_content_type_header(std::vector + header); } // namespace charencoding } // namespace newsboat diff --git a/rust/libnewsboat-ffi/src/charencoding.rs b/rust/libnewsboat-ffi/src/charencoding.rs index d74b0abd1..396ec1871 100644 --- a/rust/libnewsboat-ffi/src/charencoding.rs +++ b/rust/libnewsboat-ffi/src/charencoding.rs @@ -5,6 +5,7 @@ mod bridged { extern "Rust" { fn charset_from_bom(content: &[u8], output: &mut String) -> bool; fn charset_from_xml_declaration(content: &[u8], output: &mut String) -> bool; + fn charset_from_content_type_header(content: &[u8], output: &mut String) -> bool; } } @@ -33,3 +34,16 @@ fn charset_from_xml_declaration(content: &[u8], output: &mut String) -> bool { None => false, } } + +// Temporarily ignore clippy lint until PR is merged: +// https://github.com/rust-lang/rust-clippy/pull/12756 +#[allow(clippy::assigning_clones)] +fn charset_from_content_type_header(content: &[u8], output: &mut String) -> bool { + match charencoding::charset_from_content_type_header(content) { + Some(charset) => { + *output = charset.to_owned(); + true + } + None => false, + } +} diff --git a/rust/libnewsboat/src/charencoding.rs 
/// Extracts the `charset` parameter from the value of an HTTP `Content-Type` header.
///
/// The header is expected to look like `type/subtype; key=value; key="value"`.
/// Parameter names are compared ASCII-case-insensitively. Values may be bare tokens
/// or double-quoted strings; the quotes are stripped and backslash escapes are not
/// interpreted.
///
/// Parsing is deliberately lenient:
/// - leading spaces and tabs are ignored;
/// - a malformed parameter (no `=`, empty name or value, unterminated quote) is
///   skipped and scanning resumes at the next `;` instead of giving up;
/// - a `charset` whose value is empty or not valid UTF-8 is ignored, and a later
///   `charset` parameter is used if there is one.
///
/// Returns `None` if the header does not start with a `type/subtype` pair or does
/// not contain a usable `charset` parameter.
pub fn charset_from_content_type_header(input: &[u8]) -> Option<String> {
    /// Returns `true` for the bytes that terminate a token.
    fn is_delimiter(byte: u8) -> bool {
        matches!(byte, b';' | b'=' | b'/' | b' ' | b'\t')
    }

    /// Drops leading spaces and tabs.
    fn skip_whitespace(input: &[u8]) -> &[u8] {
        let start = input
            .iter()
            .position(|&byte| byte != b' ' && byte != b'\t')
            .unwrap_or(input.len());
        &input[start..]
    }

    /// Splits a non-empty token off the front of `input`, returning `(token, rest)`.
    fn split_token(input: &[u8]) -> Option<(&[u8], &[u8])> {
        let end = input
            .iter()
            .position(|&byte| is_delimiter(byte))
            .unwrap_or(input.len());
        (end > 0).then(|| input.split_at(end))
    }

    /// Splits a double-quoted string off the front of `input`, returning
    /// `(text between the quotes, rest)`. Fails if the closing quote is missing.
    fn split_quoted_string(input: &[u8]) -> Option<(&[u8], &[u8])> {
        let [b'"', body @ ..] = input else {
            return None;
        };
        let end = body.iter().position(|&byte| byte == b'"')?;
        Some((&body[..end], &body[end + 1..]))
    }

    /// Splits one `key=value` pair off the front of `input`, returning
    /// `(key, value, rest)`. No whitespace is accepted around `=`.
    fn split_parameter(input: &[u8]) -> Option<(&[u8], &[u8], &[u8])> {
        let (key, rest) = split_token(input)?;
        let rest = rest.strip_prefix(b"=")?;
        // A value that opens with a quote must be a complete quoted string; falling
        // back to a bare token here would return a label with a stray `"` in it.
        let (value, rest) = if rest.first() == Some(&b'"') {
            split_quoted_string(rest)?
        } else {
            split_token(rest)?
        };
        Some((key, value, rest))
    }

    // The media type itself is not used, but it has to be present and well-formed.
    let (_type, rest) = split_token(skip_whitespace(input))?;
    let (_subtype, mut rest) = split_token(rest.strip_prefix(b"/")?)?;

    // Every parameter is introduced by `;`. Jumping to the next `;` on each round
    // skips the optional whitespace after a well-formed parameter as well as the
    // remains of a malformed one.
    while let Some(separator) = rest.iter().position(|&byte| byte == b';') {
        rest = skip_whitespace(&rest[separator + 1..]);

        let Some((key, value, remaining)) = split_parameter(rest) else {
            continue;
        };
        rest = remaining;

        if !key.eq_ignore_ascii_case(b"charset") || value.is_empty() {
            continue;
        }
        if let Ok(charset) = std::str::from_utf8(value) {
            return Some(charset.to_owned());
        }
    }

    None
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn t_charset_from_content_type_header_without_charset_parameter() {
        assert_eq!(charset_from_content_type_header(b""), None);
        assert_eq!(charset_from_content_type_header(b"application/xml"), None);
        assert_eq!(
            charset_from_content_type_header(b"multipart/form-data; boundary=something"),
            None
        );
    }

    #[test]
    fn t_charset_from_content_type_header_with_charset_parameter() {
        assert_eq!(
            charset_from_content_type_header(b"application/xml; charset=utf-8"),
            Some("utf-8".to_owned())
        );
        assert_eq!(
            charset_from_content_type_header(
                b"multipart/form-data; boundary=something; charset=iso-8859-1"
            ),
            Some("iso-8859-1".to_owned())
        );
    }

    #[test]
    fn t_charset_from_content_type_header_with_charset_parameter_quoted() {
        assert_eq!(
            charset_from_content_type_header(b"application/xml; charset=\"utf-8\""),
            Some("utf-8".to_owned())
        );
    }

    #[test]
    fn t_charset_from_content_type_header_with_charset_parameter_case_insensitive() {
        assert_eq!(
            charset_from_content_type_header(b"application/xml; Charset=utf-8"),
            Some("utf-8".to_owned())
        );
    }

    #[test]
    fn t_charset_from_content_type_header_with_charset_alternative_whitespace_usage() {
        assert_eq!(
            charset_from_content_type_header(b"application/xml;charset=utf-8"),
            Some("utf-8".to_owned())
        );
        assert_eq!(
            charset_from_content_type_header(b"application/xml\t \t;charset=utf-8"),
            Some("utf-8".to_owned())
        );
        assert_eq!(
            charset_from_content_type_header(b"application/xml;\t \tcharset=utf-8"),
            Some("utf-8".to_owned())
        );
        assert_eq!(
            charset_from_content_type_header(b"application/xml\t \t;\t \tcharset=utf-8"),
            Some("utf-8".to_owned())
        );
    }

    #[test]
    fn t_charset_from_content_type_header_skips_malformed_parameters() {
        assert_eq!(
            charset_from_content_type_header(b" \tapplication/xml; charset=utf-8"),
            Some("utf-8".to_owned())
        );
        assert_eq!(
            charset_from_content_type_header(b"application/xml; foo; charset=utf-8"),
            Some("utf-8".to_owned())
        );
        assert_eq!(
            charset_from_content_type_header(b"application/xml;; charset=utf-8"),
            Some("utf-8".to_owned())
        );
        assert_eq!(
            charset_from_content_type_header(b"application/xml; charset=\"\"; charset=utf-8"),
            Some("utf-8".to_owned())
        );
        assert_eq!(
            charset_from_content_type_header(b"application/xml; charset=\"utf-8"),
            None
        );
    }
}
b/src/charencoding.cpp index db1402fa7..f2477c698 100644 --- a/src/charencoding.cpp +++ b/src/charencoding.cpp @@ -26,5 +26,16 @@ nonstd::optional charset_from_xml_declaration(std::vector charset_from_content_type_header(std::vector + header) +{ + rust::String charset; + const auto input = rust::Slice(header.data(), header.size()); + if (charencoding::bridged::charset_from_content_type_header(input, charset)) { + return std::string(charset); + } + return {}; +} + } // namespace charencoding } // namespace newsboat diff --git a/test/charencoding.cpp b/test/charencoding.cpp index cd656144a..cadb53d5e 100644 --- a/test/charencoding.cpp +++ b/test/charencoding.cpp @@ -52,3 +52,26 @@ TEST_CASE("charset_from_xml_declaration", "[charencoding]") REQUIRE(actual == expected); } } + +TEST_CASE("charset_from_content_type_header", "[charencoding]") +{ + const std::map> test_cases { + { "", nonstd::nullopt }, + { "application/xml", nonstd::nullopt }, + { "multipart/form-data; boundary=something", nonstd::nullopt }, + { "application/xml; charset=utf-8", "utf-8" }, + }; + + for (const auto& test_case : test_cases) { + std::vector input(test_case.first.begin(), test_case.first.end()); + + const auto actual = charencoding::charset_from_content_type_header(input); + const auto expected = test_case.second; + + INFO("input: " << test_case.first); + INFO("actual: " << (actual.has_value() ? actual.value().c_str() : "")); + INFO("expected: " << (expected.has_value() ? expected.value().c_str() : "")); + + REQUIRE(actual == expected); + } +}