From b3caceb99ed3eab1d55bba723426c48b499b4825 Mon Sep 17 00:00:00 2001
From: Dennis van der Schagt <dennisschagt@gmail.com>
Date: Mon, 22 Jul 2024 13:54:48 +0200
Subject: [PATCH] Implement parsing of Content-Type charset parameter

---
 include/charencoding.h                   |   2 +
 rust/libnewsboat-ffi/src/charencoding.rs |  14 +++
 rust/libnewsboat/src/charencoding.rs     | 116 ++++++++++++++++++++++-
 src/charencoding.cpp                     |  11 +++
 test/charencoding.cpp                    |  23 +++++
 5 files changed, 165 insertions(+), 1 deletion(-)
diff --git a/include/charencoding.h b/include/charencoding.h
index 5333dc3be..95a85fc25 100644
--- a/include/charencoding.h
+++ b/include/charencoding.h
@@ -13,6 +13,8 @@ namespace charencoding {
 nonstd::optional<std::string> charset_from_bom(std::vector<std::uint8_t> content);
 nonstd::optional<std::string> charset_from_xml_declaration(std::vector<std::uint8_t>
 	content);
+nonstd::optional<std::string> charset_from_content_type_header(std::vector<std::uint8_t>
+	header);
 
 } // namespace charencoding
 } // namespace newsboat
diff --git a/rust/libnewsboat-ffi/src/charencoding.rs b/rust/libnewsboat-ffi/src/charencoding.rs
index d74b0abd1..396ec1871 100644
--- a/rust/libnewsboat-ffi/src/charencoding.rs
+++ b/rust/libnewsboat-ffi/src/charencoding.rs
@@ -5,6 +5,7 @@ mod bridged {
     extern "Rust" {
         fn charset_from_bom(content: &[u8], output: &mut String) -> bool;
         fn charset_from_xml_declaration(content: &[u8], output: &mut String) -> bool;
+        fn charset_from_content_type_header(content: &[u8], output: &mut String) -> bool;
     }
 }
 
@@ -33,3 +34,16 @@ fn charset_from_xml_declaration(content: &[u8], output: &mut String) -> bool {
         None => false,
     }
 }
+
+/// Extracts the `charset` parameter from a Content-Type header value. Returns
+/// `true` and moves the charset into `output` if one is found; returns `false`
+/// and leaves `output` untouched otherwise.
+fn charset_from_content_type_header(content: &[u8], output: &mut String) -> bool {
+    match charencoding::charset_from_content_type_header(content) {
+        Some(charset) => {
+            *output = charset;
+            true
+        }
+        None => false,
+    }
+}
diff --git a/rust/libnewsboat/src/charencoding.rs b/rust/libnewsboat/src/charencoding.rs
index e7d52886a..382ddcd7f 100644
--- a/rust/libnewsboat/src/charencoding.rs
+++ b/rust/libnewsboat/src/charencoding.rs
@@ -1,5 +1,5 @@
 use nom::branch::alt;
-use nom::bytes::complete::tag;
+use nom::bytes::complete::{tag, take_till, take_till1};
 use nom::character::complete::{alpha1, alphanumeric1, digit1, space0};
 use nom::combinator::recognize;
 use nom::multi::many0;
@@ -87,6 +87,60 @@ fn charset_from_ascii_xml_declaration(content: &[u8]) -> Option<String> {
         .map(|(_, encoding)| encoding)
 }
 
+/// Extracts the `charset` parameter from the value of a Content-Type header.
+///
+/// Expects `type "/" subtype *( OWS ";" OWS name "=" value )`, where `value` is
+/// a token or a quoted string (escapes inside quotes are not interpreted).
+/// Parameter names are matched ASCII case-insensitively; the value is returned
+/// verbatim. Returns `None` if the header is malformed or has no valid charset.
+pub fn charset_from_content_type_header(input: &[u8]) -> Option<String> {
+    struct Parameter<'a> {
+        key: &'a [u8],
+        value: &'a [u8],
+    }
+
+    fn parse_token(input: &[u8]) -> IResult<&[u8], &[u8]> {
+        take_till1(|c| c == b';' || c == b'=' || c == b'/' || c == b' ' || c == b'\t')(input)
+    }
+
+    fn parse_quoted_string(input: &[u8]) -> IResult<&[u8], &[u8]> {
+        let (input, _) = tag(b"\"")(input)?;
+        let (input, text) = take_till(|c| c == b'"')(input)?;
+        let (input, _) = tag(b"\"")(input)?;
+        Ok((input, text))
+    }
+
+    fn parse_parameter(input: &[u8]) -> IResult<&[u8], Parameter> {
+        let (input, _) = space0(input)?;
+        let (input, _) = tag(b";")(input)?;
+        let (input, _) = space0(input)?;
+        let (input, key) = parse_token(input)?;
+        let (input, _) = tag(b"=")(input)?;
+        let (input, value) = alt((parse_quoted_string, parse_token))(input)?;
+        Ok((input, Parameter { key, value }))
+    }
+
+    fn parse_media_type(input: &[u8]) -> IResult<&[u8], Vec<Parameter>> {
+        // Field values may arrive with leading whitespace still attached.
+        let (input, _) = space0(input)?;
+        let (input, _type) = parse_token(input)?;
+        let (input, _) = tag(b"/")(input)?;
+        let (input, _subtype) = parse_token(input)?;
+        many0(parse_parameter)(input)
+    }
+
+    // First parameter called `name` whose value is valid UTF-8.
+    fn get_parameter(parameters: &[Parameter], name: &str) -> Option<String> {
+        parameters
+            .iter()
+            .filter(|parameter| parameter.key.eq_ignore_ascii_case(name.as_bytes()))
+            .find_map(|parameter| str::from_utf8(parameter.value).ok().map(String::from))
+    }
+
+    let (_, parameters) = parse_media_type(input).ok()?;
+    get_parameter(&parameters, "charset")
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -186,4 +240,64 @@ mod tests {
             Some("UTF-16BE".to_owned())
         );
     }
+
+    /// Asserts that parsing `header` as a Content-Type value yields `expected`.
+    ///
+    /// `#[track_caller]` makes a failure point at the calling test line.
+    #[track_caller]
+    fn assert_header_charset(header: &[u8], expected: Option<&str>) {
+        let actual = charset_from_content_type_header(header);
+        assert_eq!(actual, expected.map(String::from));
+    }
+
+    #[test]
+    fn t_charset_from_content_type_header_without_charset_parameter() {
+        // Empty input.
+        assert_header_charset(b"", None);
+
+        // Media type without any parameters.
+        assert_header_charset(b"application/xml", None);
+
+        // Parameters are present, but none of them is `charset`.
+        assert_header_charset(b"multipart/form-data; boundary=something", None);
+    }
+
+    #[test]
+    fn t_charset_from_content_type_header_with_charset_parameter() {
+        // `charset` is the only parameter.
+        assert_header_charset(b"application/xml; charset=utf-8", Some("utf-8"));
+
+        // `charset` follows another parameter.
+        assert_header_charset(
+            b"multipart/form-data; boundary=something; charset=iso-8859-1",
+            Some("iso-8859-1"),
+        );
+    }
+
+    #[test]
+    fn t_charset_from_content_type_header_with_charset_parameter_quoted() {
+        // The surrounding double quotes are not part of the returned value.
+        assert_header_charset(b"application/xml; charset=\"utf-8\"", Some("utf-8"));
+    }
+
+    #[test]
+    fn t_charset_from_content_type_header_with_charset_parameter_case_insensitive() {
+        // The parameter name is matched regardless of letter case.
+        assert_header_charset(b"application/xml; Charset=utf-8", Some("utf-8"));
+    }
+
+    #[test]
+    fn t_charset_from_content_type_header_with_charset_alternative_whitespace_usage() {
+        // No whitespace around the `;` separator.
+        assert_header_charset(b"application/xml;charset=utf-8", Some("utf-8"));
+
+        // Whitespace before the separator only.
+        assert_header_charset(b"application/xml\t \t;charset=utf-8", Some("utf-8"));
+
+        // Whitespace after the separator only.
+        assert_header_charset(b"application/xml;\t \tcharset=utf-8", Some("utf-8"));
+
+        // Whitespace on both sides of the separator.
+        assert_header_charset(b"application/xml\t \t;\t \tcharset=utf-8", Some("utf-8"));
+    }
 }
diff --git a/src/charencoding.cpp b/src/charencoding.cpp
index db1402fa7..f2477c698 100644
--- a/src/charencoding.cpp
+++ b/src/charencoding.cpp
@@ -26,5 +26,16 @@ nonstd::optional<std::string> charset_from_xml_declaration(std::vector<std::uint
 	return {};
 }
 
+nonstd::optional<std::string> charset_from_content_type_header(std::vector<std::uint8_t>
+	header)
+{
+	rust::String parsed; // only assigned by the Rust side when it returns true
+	const rust::Slice<const std::uint8_t> bytes(header.data(), header.size());
+	if (!charencoding::bridged::charset_from_content_type_header(bytes, parsed)) {
+		return {};
+	}
+	return std::string(parsed);
+}
+
 } // namespace charencoding
 } // namespace newsboat
diff --git a/test/charencoding.cpp b/test/charencoding.cpp
index cd656144a..cadb53d5e 100644
--- a/test/charencoding.cpp
+++ b/test/charencoding.cpp
@@ -52,3 +52,26 @@ TEST_CASE("charset_from_xml_declaration", "[charencoding]")
 		REQUIRE(actual == expected);
 	}
 }
+
+TEST_CASE("charset_from_content_type_header", "[charencoding]")
+{
+	// Maps a Content-Type header value to the charset it is expected to yield.
+	const std::map<std::string, nonstd::optional<std::string>> expectations {
+		{ "", nonstd::nullopt },
+		{ "application/xml", nonstd::nullopt },
+		{ "multipart/form-data; boundary=something", nonstd::nullopt },
+		{ "application/xml; charset=utf-8", "utf-8" },
+	};
+
+	for (const auto& entry : expectations) {
+		const std::string& header = entry.first;
+		const nonstd::optional<std::string>& expected = entry.second;
+		const std::vector<std::uint8_t> bytes(header.begin(), header.end());
+		const auto actual = charencoding::charset_from_content_type_header(bytes);
+
+		INFO("input: " << header);
+		INFO("actual: " << (actual.has_value() ? actual.value().c_str() : ""));
+		INFO("expected: " << (expected.has_value() ? expected.value().c_str() : ""));
+		REQUIRE(actual == expected);
+	}
+}