Skip to content

Commit

Permalink
Merge pull request newsboat#2802 from dennisschagt/parse-content-type…
Browse files Browse the repository at this point in the history
…-charset

Implement parsing of Content-Type charset parameter
  • Loading branch information
dennisschagt committed Jul 22, 2024
2 parents 93ea967 + b3caceb commit 2ffe841
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 1 deletion.
2 changes: 2 additions & 0 deletions include/charencoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ namespace charencoding {
nonstd::optional<std::string> charset_from_bom(std::vector<std::uint8_t> content);
nonstd::optional<std::string> charset_from_xml_declaration(std::vector<std::uint8_t>
content);
/// Extracts the value of the `charset` parameter from the value of a
/// Content-Type header (e.g. "application/xml; charset=utf-8").
/// Returns an empty optional if no charset could be extracted.
nonstd::optional<std::string> charset_from_content_type_header(std::vector<std::uint8_t>
header);

} // namespace charencoding
} // namespace newsboat
Expand Down
14 changes: 14 additions & 0 deletions rust/libnewsboat-ffi/src/charencoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ mod bridged {
extern "Rust" {
fn charset_from_bom(content: &[u8], output: &mut String) -> bool;
fn charset_from_xml_declaration(content: &[u8], output: &mut String) -> bool;
// Looks for a charset in the value of a Content-Type header. When one is found, it is
// written to `output` and `true` is returned; otherwise `false` is returned.
fn charset_from_content_type_header(content: &[u8], output: &mut String) -> bool;
}
}

Expand Down Expand Up @@ -33,3 +34,16 @@ fn charset_from_xml_declaration(content: &[u8], output: &mut String) -> bool {
None => false,
}
}

/// Bridge wrapper around `charencoding::charset_from_content_type_header`.
///
/// `content` is the raw value of a Content-Type header. When a charset is found,
/// it is stored in `output` and `true` is returned. Otherwise `false` is returned
/// and `output` is left untouched.
fn charset_from_content_type_header(content: &[u8], output: &mut String) -> bool {
    match charencoding::charset_from_content_type_header(content) {
        Some(charset) => {
            // `charset` is already an owned `String`: move it into place instead of
            // cloning it. This also avoids clippy's `assigning_clones` lint, so no
            // `#[allow]` is needed here.
            *output = charset;
            true
        }
        None => false,
    }
}
116 changes: 115 additions & 1 deletion rust/libnewsboat/src/charencoding.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::bytes::complete::{tag, take_till, take_till1};
use nom::character::complete::{alpha1, alphanumeric1, digit1, space0};
use nom::combinator::recognize;
use nom::multi::many0;
Expand Down Expand Up @@ -87,6 +87,60 @@ fn charset_from_ascii_xml_declaration(content: &[u8]) -> Option<String> {
.map(|(_, encoding)| encoding)
}

/// Extracts the `charset` parameter from the value of an HTTP `Content-Type` header.
///
/// `input` is the raw header value, e.g. `application/xml; charset=utf-8`. It has to
/// start with a `type/subtype` pair, which may be followed by any number of
/// `;`-separated `key=value` parameters. Spaces and tabs are accepted before and
/// after each `;`. A parameter value is either a bare token or a double-quoted
/// string; the surrounding quotes are stripped and backslash escapes inside the
/// quotes are not interpreted.
///
/// Returns the value of the first parameter named `charset` (ASCII
/// case-insensitive) whose value is valid UTF-8. Returns `None` if the media type
/// cannot be parsed or if there is no such parameter.
pub fn charset_from_content_type_header(input: &[u8]) -> Option<String> {
    /// A single `key=value` parameter, borrowed from the header value.
    struct Parameter<'a> {
        key: &'a [u8],
        value: &'a [u8],
    }

    /// Parses a non-empty run of bytes up to the next delimiter (`;`, `=`, `/`,
    /// space or tab).
    fn parse_token(input: &[u8]) -> IResult<&[u8], &[u8]> {
        take_till1(|c| c == b';' || c == b'=' || c == b'/' || c == b' ' || c == b'\t')(input)
    }

    /// Parses a double-quoted string and returns the (possibly empty) text between
    /// the quotes.
    fn parse_quoted_string(input: &[u8]) -> IResult<&[u8], &[u8]> {
        let (input, _) = tag(b"\"")(input)?;
        let (input, text) = take_till(|c| c == b'"')(input)?;
        let (input, _) = tag(b"\"")(input)?;
        Ok((input, text))
    }

    /// Parses `;key=value`, allowing spaces and tabs around the `;`.
    fn parse_parameter(input: &[u8]) -> IResult<&[u8], Parameter<'_>> {
        let (input, _) = space0(input)?;
        let (input, _) = tag(b";")(input)?;
        let (input, _) = space0(input)?;
        let (input, key) = parse_token(input)?;
        let (input, _) = tag(b"=")(input)?;
        let (input, value) = alt((parse_quoted_string, parse_token))(input)?;
        Ok((input, Parameter { key, value }))
    }

    /// Parses `type/subtype` followed by its parameters; only the parameters are
    /// returned.
    fn parse_media_type(input: &[u8]) -> IResult<&[u8], Vec<Parameter<'_>>> {
        let (input, _type) = parse_token(input)?;
        let (input, _) = tag(b"/")(input)?;
        let (input, _subtype) = parse_token(input)?;
        // `many0` stops at the first parameter that fails to parse; whatever follows
        // it is ignored rather than reported as an error.
        let (input, parameters) = many0(parse_parameter)(input)?;
        Ok((input, parameters))
    }

    /// Returns the value of the first parameter whose key equals `name` (ASCII
    /// case-insensitive) and whose value is valid UTF-8.
    fn get_parameter(parameters: &[Parameter<'_>], name: &str) -> Option<String> {
        parameters
            .iter()
            // Compare the raw bytes directly: this avoids allocating lowercase copies
            // of both strings for every parameter.
            .filter(|parameter| parameter.key.eq_ignore_ascii_case(name.as_bytes()))
            // A matching key with a non-UTF-8 value is skipped, not treated as a hit.
            .find_map(|parameter| str::from_utf8(parameter.value).ok())
            .map(String::from)
    }

    parse_media_type(input)
        .ok()
        .and_then(|(_, parameters)| get_parameter(&parameters, "charset"))
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -186,4 +240,64 @@ mod tests {
Some("UTF-16BE".to_owned())
);
}

#[test]
fn t_charset_from_content_type_header_without_charset_parameter() {
    // None of these header values carries a `charset` parameter.
    let headers: [&[u8]; 3] = [
        b"",
        b"application/xml",
        b"multipart/form-data; boundary=something",
    ];

    for header in headers {
        assert_eq!(charset_from_content_type_header(header), None);
    }
}

#[test]
fn t_charset_from_content_type_header_with_charset_parameter() {
    // (header value, charset expected to be extracted from it)
    let cases = [
        ("application/xml; charset=utf-8", "utf-8"),
        (
            "multipart/form-data; boundary=something; charset=iso-8859-1",
            "iso-8859-1",
        ),
    ];

    for (header, charset) in cases {
        let actual = charset_from_content_type_header(header.as_bytes());
        assert_eq!(actual, Some(charset.to_owned()));
    }
}

#[test]
fn t_charset_from_content_type_header_with_charset_parameter_quoted() {
    // The quotes around the value must not end up in the returned charset.
    let header = b"application/xml; charset=\"utf-8\"";
    let expected = Some(String::from("utf-8"));

    assert_eq!(charset_from_content_type_header(header), expected);
}

#[test]
fn t_charset_from_content_type_header_with_charset_parameter_case_insensitive() {
    // The parameter name is spelled with a capital letter; it must still be found.
    let header = b"application/xml; Charset=utf-8";
    let expected = Some(String::from("utf-8"));

    assert_eq!(charset_from_content_type_header(header), expected);
}

#[test]
fn t_charset_from_content_type_header_with_charset_alternative_whitespace_usage() {
    // Same parameter each time; only the whitespace around the `;` differs.
    let headers: [&[u8]; 4] = [
        b"application/xml;charset=utf-8",
        b"application/xml\t \t;charset=utf-8",
        b"application/xml;\t \tcharset=utf-8",
        b"application/xml\t \t;\t \tcharset=utf-8",
    ];

    for header in headers {
        assert_eq!(
            charset_from_content_type_header(header),
            Some("utf-8".to_owned())
        );
    }
}
}
11 changes: 11 additions & 0 deletions src/charencoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,16 @@ nonstd::optional<std::string> charset_from_xml_declaration(std::vector<std::uint
return {};
}

/// Returns the charset found in the given Content-Type header value, or an empty
/// optional when the bridged Rust parser reports that there is none.
nonstd::optional<std::string> charset_from_content_type_header(std::vector<std::uint8_t>
	header)
{
	// View over the caller's bytes that can be passed across the bridge.
	const rust::Slice<const std::uint8_t> bytes(header.data(), header.size());
	rust::String parsed;

	const bool found = charencoding::bridged::charset_from_content_type_header(bytes, parsed);
	if (!found) {
		return {};
	}
	return std::string(parsed);
}

} // namespace charencoding
} // namespace newsboat
23 changes: 23 additions & 0 deletions test/charencoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,26 @@ TEST_CASE("charset_from_xml_declaration", "[charencoding]")
REQUIRE(actual == expected);
}
}

TEST_CASE("charset_from_content_type_header", "[charencoding]")
{
	// Maps a Content-Type header value to the charset expected to be extracted from it.
	const std::map<std::string, nonstd::optional<std::string>> test_cases {
		{ "", nonstd::nullopt },
		{ "application/xml", nonstd::nullopt },
		{ "multipart/form-data; boundary=something", nonstd::nullopt },
		{ "application/xml; charset=utf-8", "utf-8" },
	};

	for (const auto& test_case : test_cases) {
		const std::string& header = test_case.first;
		const nonstd::optional<std::string>& expected = test_case.second;

		// The function under test takes raw bytes rather than a string.
		const std::vector<std::uint8_t> input(header.begin(), header.end());
		const auto actual = charencoding::charset_from_content_type_header(input);

		INFO("input: " << header);
		INFO("actual: " << (actual.has_value() ? actual.value().c_str() : ""));
		INFO("expected: " << (expected.has_value() ? expected.value().c_str() : ""));

		REQUIRE(actual == expected);
	}
}

0 comments on commit 2ffe841

Please sign in to comment.