Skip to content

Commit

Permalink
feat(inline): switch to grapheme iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
mhatzl committed Mar 14, 2022
1 parent 0ccdff1 commit 16eed17
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 50 deletions.
94 changes: 54 additions & 40 deletions inline/src/parser/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -1,29 +1,44 @@
use std::{collections::{HashMap, hash_map::Entry::Vacant}, cmp::min};

use unicode_segmentation::{UnicodeSegmentation, Graphemes};

use crate::Position;

use super::tokens::{Token, TokenKind, AsSingleTokenKind, SingleTokenKind};


#[derive(Debug, Default)]
struct Tokenized {
#[derive(Debug)]
struct Tokenized<'a> {
graphemes: Graphemes<'a>,
tokens: Vec::<Token>,
open_tokens: HashMap::<TokenKind, usize>,
cur_pos: Position,
escape_active: bool,
}

impl<'a> Tokenized<'a> {
fn new(content: &'a str) -> Self {
Self {
graphemes: content.graphemes(true),
tokens: Default::default(),
open_tokens: Default::default(),
cur_pos: Default::default(),
escape_active: Default::default()
}
}
}

/// Converts an input into a sequence of inline tokens.
pub(crate) trait Tokenizer {
    /// Consumes `self` and returns the tokens found in it.
    fn tokenize(self) -> Vec<Token>;
}

impl Tokenizer for &str {
fn tokenize(self) -> Vec<Token> {
let mut tokenized = Tokenized::default();
let mut tokenized = Tokenized::new(self);
let mut last_token_index = 0;

for c in self.chars() {
update_tokens(&mut tokenized, c);
while let Some(grapheme) = tokenized.graphemes.next() {
update_tokens(&mut tokenized, grapheme);

let updated_last_token_index = if tokenized.tokens.is_empty() { 0 } else { tokenized.tokens.len() - 1 };
if last_token_index != updated_last_token_index && updated_last_token_index > 0 {
Expand All @@ -45,18 +60,17 @@ impl Tokenizer for &str {
}
}

fn update_tokens(tokenized: &mut Tokenized, c: char) {
fn update_tokens(tokenized: &mut Tokenized, grapheme: &str) {
if tokenized.escape_active {
update_escaped(tokenized, c);
update_escaped(tokenized, grapheme);
tokenized.escape_active = false;
} else {
let single_token_kind = c.as_single_token_kind();
// only single char tokens need to be handled here, because `c` is only one char
let single_token_kind = grapheme.as_single_token_kind();
// only single grapheme tokens need to be handled here, because only single grapheme is handled per update
match single_token_kind {
SingleTokenKind::Plain => update_plain(tokenized, c),
SingleTokenKind::LineFeed | SingleTokenKind::CarriageReturn => todo!(),
SingleTokenKind::Tab => todo!(),
SingleTokenKind::Space => update_space(tokenized, c),
SingleTokenKind::Plain => update_plain(tokenized, grapheme),
SingleTokenKind::Newline => todo!(),
SingleTokenKind::Space => update_space(tokenized, grapheme),
SingleTokenKind::Backslash => {
tokenized.escape_active = true;
tokenized.cur_pos.column += 1;
Expand All @@ -66,7 +80,7 @@ fn update_tokens(tokenized: &mut Tokenized, c: char) {
SingleTokenKind::Colon => todo!(),
SingleTokenKind::Caret => todo!(),
SingleTokenKind::Underscore => todo!(),
SingleTokenKind::Asterisk => update_asterisk(tokenized, c),
SingleTokenKind::Asterisk => update_asterisk(tokenized, grapheme),
SingleTokenKind::Plus => todo!(),
}
}
Expand Down Expand Up @@ -173,45 +187,45 @@ fn update_open_map(tokenized: &mut Tokenized, next_token_is_space_or_newline: bo
}
}

fn update_plain(tokenized: &mut Tokenized, c: char) {
fn update_plain(tokenized: &mut Tokenized, grapheme: &str) {
if let Some(last) = tokenized.tokens.last_mut() {
if last.kind == TokenKind::Plain {
last.content.push(c);
last.content.push_str(grapheme);
} else {
tokenized.cur_pos.column += last.length();
let new_token = Token{ kind: TokenKind::Plain, content: c.to_string(), position: tokenized.cur_pos };
let new_token = Token{ kind: TokenKind::Plain, content: grapheme.to_string(), position: tokenized.cur_pos };
tokenized.tokens.push(new_token);
}
} else {
let new_token = Token{ kind: TokenKind::Plain, content: c.to_string(), position: tokenized.cur_pos };
let new_token = Token{ kind: TokenKind::Plain, content: grapheme.to_string(), position: tokenized.cur_pos };
tokenized.tokens.push(new_token);
}
}

fn update_escaped(tokenized: &mut Tokenized, c: char) {
tokenized.tokens.push(Token{ kind: TokenKind::EscapedChar, content: c.to_string(), position: tokenized.cur_pos });
fn update_escaped(tokenized: &mut Tokenized, grapheme: &str) {
tokenized.tokens.push(Token{ kind: TokenKind::EscapedChar, content: grapheme.to_string(), position: tokenized.cur_pos });
}

fn update_space(tokenized: &mut Tokenized, c: char) {
fn update_space(tokenized: &mut Tokenized, grapheme: &str) {
if let Some(last) = tokenized.tokens.last_mut() {
if last.kind == TokenKind::Space {
last.content.push(c);
last.content.push_str(grapheme);
} else {
tokenized.cur_pos.column += last.length();
let new_token = Token{ kind: TokenKind::Space, content: c.to_string(), position: tokenized.cur_pos };
let new_token = Token{ kind: TokenKind::Space, content: grapheme.to_string(), position: tokenized.cur_pos };
tokenized.tokens.push(new_token);
}
} else {
let new_token = Token{ kind: TokenKind::Space, content: c.to_string(), position: tokenized.cur_pos };
let new_token = Token{ kind: TokenKind::Space, content: grapheme.to_string(), position: tokenized.cur_pos };
tokenized.tokens.push(new_token);
}
}

fn update_asterisk(tokenized: &mut Tokenized, c: char) {
fn update_asterisk(tokenized: &mut Tokenized, grapheme: &str) {
match tokenized.tokens.pop() {
Some(mut last) => {
if last.kind == TokenKind::ItalicOpen {
last.content.push(c);
last.content.push_str(grapheme);

if tokenized.open_tokens.get(&TokenKind::BoldOpen).is_some() {
let preceding_token = tokenized.tokens.last().expect("Tokens must not be empty, because open token exists");
Expand Down Expand Up @@ -253,7 +267,7 @@ fn update_asterisk(tokenized: &mut Tokenized, c: char) {
}
} else {
last.kind = TokenKind::BoldItalicOpen;
last.content.push(c);
last.content.push_str(grapheme);
tokenized.tokens.push(last);
}
} else if last.kind == TokenKind::BoldItalicOpen {
Expand Down Expand Up @@ -292,7 +306,7 @@ fn update_asterisk(tokenized: &mut Tokenized, c: char) {
tokenized.tokens.push(italic_open_token);
} else {
last.kind = TokenKind::Plain;
last.content.push(c);
last.content.push_str(grapheme);
match tokenized.tokens.last_mut() {
Some(prev) => {
if prev.kind == TokenKind::Plain {
Expand All @@ -309,52 +323,52 @@ fn update_asterisk(tokenized: &mut Tokenized, c: char) {
} else if last.kind == TokenKind::ItalicClose {
if tokenized.open_tokens.contains_key(&TokenKind::BoldItalicOpen) {
last.kind = TokenKind::BoldClose;
last.content.push(c);
last.content.push_str(grapheme);
tokenized.tokens.push(last);
} else if let Some(bold_index) = tokenized.open_tokens.get(&TokenKind::BoldOpen) {
match tokenized.open_tokens.get(&TokenKind::ItalicOpen) {
Some(italic_index) => {
if italic_index < bold_index {
last.kind = TokenKind::BoldClose;
last.content.push(c);
last.content.push_str(grapheme);
tokenized.tokens.push(last);
} else {
last.kind = TokenKind::ItalicClose;
tokenized.cur_pos.column += last.length();
tokenized.tokens.push(last);
tokenized.tokens.push(Token {
kind: TokenKind::ItalicOpen, content: c.to_string(), position: tokenized.cur_pos
kind: TokenKind::ItalicOpen, content: grapheme.to_string(), position: tokenized.cur_pos
})
}
},
None => {
last.kind = TokenKind::BoldClose;
last.content.push(c);
last.content.push_str(grapheme);
tokenized.tokens.push(last);
},
}
} else {
last.kind = TokenKind::BoldOpen;
last.content.push(c);
last.content.push_str(grapheme);
tokenized.tokens.push(last);
}
} else if last.kind == TokenKind::BoldClose {
if tokenized.open_tokens.contains_key(&TokenKind::BoldItalicOpen) {
last.content.push(c);
last.content.push_str(grapheme);
last.kind = TokenKind::BoldItalicClose;
tokenized.tokens.push(last);
} else {
// Note: handles `**bold***italic*` -> [bo]bold[bc][io]italic[ic]
tokenized.cur_pos.column += last.length();
tokenized.tokens.push(last);
let new_token = Token{ kind: TokenKind::ItalicOpen, content: c.to_string(), position: tokenized.cur_pos };
let new_token = Token{ kind: TokenKind::ItalicOpen, content: grapheme.to_string(), position: tokenized.cur_pos };
tokenized.tokens.push(new_token);
}
} else if last.kind == TokenKind::BoldItalicClose {
// Note: handles `***bold & italic****italic*` -> [bio]bold & italic[bic][io]italic[ic]
tokenized.cur_pos.column += last.length();
tokenized.tokens.push(last);
let new_token = Token{ kind: TokenKind::ItalicOpen, content: c.to_string(), position: tokenized.cur_pos };
let new_token = Token{ kind: TokenKind::ItalicOpen, content: grapheme.to_string(), position: tokenized.cur_pos };
tokenized.tokens.push(new_token);
} else {
let new_token;
Expand All @@ -365,20 +379,20 @@ fn update_asterisk(tokenized: &mut Tokenized, c: char) {

if last.is_space_or_newline() {
// Note: closing not allowed after space
new_token = Token{ kind: TokenKind::ItalicOpen, content: c.to_string(), position: tokenized.cur_pos };
new_token = Token{ kind: TokenKind::ItalicOpen, content: grapheme.to_string(), position: tokenized.cur_pos };
} else {
new_token = Token{ kind: TokenKind::ItalicClose, content: c.to_string(), position: tokenized.cur_pos };
new_token = Token{ kind: TokenKind::ItalicClose, content: grapheme.to_string(), position: tokenized.cur_pos };
}
} else {
new_token = Token{ kind: TokenKind::ItalicOpen, content: c.to_string(), position: tokenized.cur_pos };
new_token = Token{ kind: TokenKind::ItalicOpen, content: grapheme.to_string(), position: tokenized.cur_pos };
}

tokenized.tokens.push(last);
tokenized.tokens.push(new_token);
}
},
None => {
let new_token = Token{ kind: TokenKind::ItalicOpen, content: c.to_string(), position: tokenized.cur_pos };
let new_token = Token{ kind: TokenKind::ItalicOpen, content: grapheme.to_string(), position: tokenized.cur_pos };
tokenized.tokens.push(new_token);
},
}
Expand Down
22 changes: 12 additions & 10 deletions inline/src/parser/tokens.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ impl Token {
#[derive(Debug, Clone, PartialEq)]
pub enum SingleTokenKind {
Plain,
LineFeed,
CarriageReturn,
Tab,
Newline,
Space,
Backslash,
ExclamationMark,
Expand Down Expand Up @@ -99,13 +97,15 @@ pub trait AsSingleTokenKind {
fn as_single_token_kind(&self) -> SingleTokenKind;
}

impl AsSingleTokenKind for char {
impl AsSingleTokenKind for &str {
fn as_single_token_kind(&self) -> SingleTokenKind {
match *self {
'*' => { SingleTokenKind::Asterisk },
'\\' => { SingleTokenKind::Backslash },
c => {
if c.is_whitespace() {
"*" => { SingleTokenKind::Asterisk },
"\\" => { SingleTokenKind::Backslash },
grapheme => {
if grapheme.is_newline() {
return SingleTokenKind::Newline;
} else if grapheme.trim().is_empty() {
return SingleTokenKind::Space;
}
SingleTokenKind::Plain
Expand Down Expand Up @@ -142,9 +142,11 @@ pub trait Newline {
}

impl Newline for &str {
    /// Note: Only temporary solution until Rust supports `is_newline()` by default.
    ///
    /// Treats `\n`, `\r\n` and `\r` each as one newline.
    fn is_newline(&self) -> bool {
        matches!(*self, "\n" | "\r\n" | "\r")
    }
}
Expand Down

0 comments on commit 16eed17

Please sign in to comment.