Refactor Japanese tokenizer code
valeriansaliou committed Aug 12, 2023
1 parent 8c3aa28 commit 990e322
Showing 1 changed file with 14 additions and 5 deletions.
src/lexer/token.rs: 19 changes (14 additions & 5 deletions)
@@ -65,7 +65,7 @@ lazy_static! {
                 mode: lindera_core::mode::Mode::Normal,
             }
         )
-        .expect("unable to initialize Japanese tokenizer");
+        .expect("unable to initialize japanese tokenizer");
 }
 
 impl TokenLexerBuilder {
@@ -254,8 +254,9 @@ impl<'a> TokenLexer<'a> {
             #[cfg(feature = "tokenizer-japanese")]
             Some(Lang::Jpn) => match TOKENIZER_LINDERA.tokenize(text) {
                 Ok(tokens) => TokenLexerWords::Lindera(tokens.into_iter()),
-                Err(e) => {
-                    warn!("unable to tokenize via lindera, falling back to the built-in tokenizer: {}", e);
+                Err(err) => {
+                    warn!("unable to tokenize japanese, falling back: {}", err);
+
                     TokenLexerWords::UAX29(text.unicode_words())
                 }
             },
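
Note: the hunk above implements graceful degradation. Japanese text goes through the Lindera tokenizer, and any tokenizer error falls back to generic UAX-29 word segmentation instead of failing outright. A minimal standalone sketch of the same pattern, with a hypothetical tokenize_japanese stub standing in for Sonic's TOKENIZER_LINDERA:

    use unicode_segmentation::UnicodeSegmentation;

    // Hypothetical stand-in for TOKENIZER_LINDERA.tokenize(); the real call
    // yields Lindera tokens, reduced here to plain strings.
    fn tokenize_japanese(text: &str) -> Result<Vec<String>, String> {
        Err(format!("no dictionary available for: {}", text))
    }

    fn segment(text: &str) -> Vec<String> {
        match tokenize_japanese(text) {
            Ok(tokens) => tokens,
            Err(err) => {
                eprintln!("unable to tokenize japanese, falling back: {}", err);

                // Same fallback as the diff: UAX-29 word boundaries
                text.unicode_words().map(str::to_owned).collect()
            }
        }
    }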
@@ -350,7 +351,7 @@ impl<'a> Iterator for TokenLexerWords<'a> {
 
             #[cfg(feature = "tokenizer-japanese")]
             TokenLexerWords::Lindera(token) => match token.next() {
-                Some(t) => Some(t.text),
+                Some(inner) => Some(inner.text),
                 None => None,
             },
         }
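
Note: the rename above (t to inner) sits inside an iterator adapter: next() forwards the inner Lindera token stream and keeps only each token's text. A simplified sketch of that shape, using illustrative Token and LinderaWords types rather than Sonic's actual ones:

    struct Token {
        text: String,
    }

    struct LinderaWords<I: Iterator<Item = Token>> {
        inner: I,
    }

    impl<I: Iterator<Item = Token>> Iterator for LinderaWords<I> {
        type Item = String;

        fn next(&mut self) -> Option<String> {
            // Project each inner token to its text; end when the stream ends
            match self.inner.next() {
                Some(inner) => Some(inner.text),
                None => None,
            }
        }
    }

The match is equivalent to self.inner.next().map(|inner| inner.text); the explicit form mirrors the diff.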
@@ -450,7 +451,7 @@ mod tests {
 
     #[cfg(feature = "tokenizer-japanese")]
     #[test]
-    fn it_cleans_token_japanese_lindera() {
+    fn it_cleans_token_japanese_lindera_product() {
         let mut token_cleaner = TokenLexerBuilder::from(
             TokenLexerMode::NormalizeAndCleanup(None),
             "関西国際空港限定トートバッグ",
@@ -471,7 +472,11 @@
             Some(("バッグ".to_string(), 3515727814))
         );
         assert_eq!(token_cleaner.next(), None);
+    }
 
+    #[cfg(feature = "tokenizer-japanese")]
+    #[test]
+    fn it_cleans_token_japanese_lindera_food() {
         let token_cleaner =
             TokenLexerBuilder::from(TokenLexerMode::NormalizeAndCleanup(None), "𠮷野家").unwrap();
 
@@ -482,7 +487,11 @@
         .unwrap();
 
         assert_eq!(token_cleaner.locale, None);
+    }
 
+    #[cfg(feature = "tokenizer-japanese")]
+    #[test]
+    fn it_cleans_token_japanese_lindera_sentence() {
         let mut token_cleaner = TokenLexerBuilder::from(
             TokenLexerMode::NormalizeAndCleanup(None),
             "𠮷野家でヱビスビールを飲んだ",
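
Note: the last two hunks split what was one long Japanese test into three focused ones, one per input: product, food, and sentence. Reconstructed from the visible context, the split-out sentence test takes roughly this shape; its concrete (word, hash) assertions are outside the hunk, so a placeholder assertion stands in:

    #[cfg(feature = "tokenizer-japanese")]
    #[test]
    fn it_cleans_token_japanese_lindera_sentence() {
        let mut token_cleaner = TokenLexerBuilder::from(
            TokenLexerMode::NormalizeAndCleanup(None),
            "𠮷野家でヱビスビールを飲んだ",
        )
        .unwrap();

        // The real test asserts each expected (word, hash) pair in order,
        // as the product test does with ("バッグ", 3515727814); those pairs
        // are not visible in this diff.
        assert!(token_cleaner.next().is_some());
    }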
