From 0cd1d592ce4fcc2ee7e6a0a0b9f70dd146c578fd Mon Sep 17 00:00:00 2001 From: Nadir Fejzic Date: Thu, 10 Nov 2022 12:01:32 +0100 Subject: [PATCH] feat(inline): implement token resolver --- inline/src/lexer/mod.rs | 1 + inline/src/lexer/resolver/mod.rs | 226 +++++++++++++++++++++++++++++++ inline/src/lexer/token.rs | 32 +++-- 3 files changed, 251 insertions(+), 8 deletions(-) create mode 100644 inline/src/lexer/resolver/mod.rs diff --git a/inline/src/lexer/mod.rs b/inline/src/lexer/mod.rs index b0ccb993..86c2a2ba 100644 --- a/inline/src/lexer/mod.rs +++ b/inline/src/lexer/mod.rs @@ -2,6 +2,7 @@ use std::{iter::Peekable, str::Lines}; use unicode_segmentation::*; +mod resolver; mod token; pub use token::*; diff --git a/inline/src/lexer/resolver/mod.rs b/inline/src/lexer/resolver/mod.rs new file mode 100644 index 00000000..145873de --- /dev/null +++ b/inline/src/lexer/resolver/mod.rs @@ -0,0 +1,226 @@ +#![allow(dead_code)] +#![warn(clippy::pedantic)] + +use std::collections::{BTreeMap, VecDeque}; + +use crate::{Spacing, Span, Token, TokenIterator, TokenKind}; + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +struct UnresolvedToken { + token: Token, + resolved: bool, + second_part: Option<Box<UnresolvedToken>>, +} + +impl UnresolvedToken { + fn pop_second_part(&mut self) -> Option<UnresolvedToken> { + self.second_part.take().map(|boxed| *boxed) + } + + fn swap_parts(&mut self) { + if let Some(second_token) = self.second_part.as_mut() { + std::mem::swap(&mut self.token, &mut second_token.token); + std::mem::swap(&mut self.resolved, &mut second_token.resolved); + } + } + + fn split_ambiguous(&mut self) { + let mut token = Token { + kind: TokenKind::Plain, + span: Span::default(), + spacing: Spacing::default(), + content: None, + }; + + std::mem::swap(&mut self.token, &mut token); + + let (first, second) = token.split_ambiguous(); + self.token = first; + self.second_part = Some(Box::new(UnresolvedToken { + token: second, + resolved: false, + second_part: None, + })); + } +} + +pub(crate) struct 
TokenResolver<'a> { + iter: TokenIterator<'a>, + tokens: VecDeque<UnresolvedToken>, +} + +impl TokenResolver<'_> { + fn consume_line(&mut self) { + for token in self.iter.by_ref() { + let should_stop = matches!(token.kind(), TokenKind::Newline | TokenKind::EndOfLine); + + let unresolved_token = UnresolvedToken { + token, + resolved: false, + second_part: None, + }; + + self.tokens.push_back(unresolved_token); + + if should_stop { + break; + } + } + } + + pub(crate) fn resolve(&mut self) { + if self.tokens.is_empty() { + self.consume_line(); + } + + // map found tokens to their index in tokens vector + let mut token_map: BTreeMap<TokenKind, Vec<usize>> = BTreeMap::new(); + + for index in 0..self.tokens.len() { + // try to resolve token + self.resolve_token(&mut token_map, index); + + if !self.tokens[index].resolved { + // save positions of every unresolved token + token_map + .entry(self.tokens[index].token.kind) + .and_modify(|indices| indices.push(index)) + .or_insert_with(|| vec![index]); + } + } + } + + fn resolve_token(&mut self, token_map: &mut BTreeMap<TokenKind, Vec<usize>>, index: usize) { + // multiple cases for current - unresolved token relationship: + // 1. current NOT ambiguous, there is unresolved one that IS NOT ambiguous: (simple, simple) + // 2. current NOT ambiguous, there is unresolved one that IS ambiguous (ambiguous, simple) + // 3. current IS ambiguous, there is unresolved one that IS ambiguous (ambiguous, ambiguous) + // 4. current IS ambiguous, there is unresolved one that IS NOT ambiguous (simple, ambiguous) + + // (1. and 2.) 
current NOT ambiguous + if !self.tokens[index].token.is_ambiguous() && self.tokens[index].token.closes() { + // there is unresolved one that IS NOT ambiguous: (simple, simple) + if let Some(indices) = token_map.get_mut(&self.tokens[index].token.kind) { + if let Some((unr_token, i)) = self.find_first_unresolved(indices) { + // resolve them both + unr_token.resolved = true; + self.tokens[index].resolved = true; + + // remove unresolved token + indices.remove(i); + } + } else if let Some(ambiguous_variant) = + self.tokens[index].token.kind.get_ambiguous_variant() + { + if let Some(indices) = token_map.get_mut(&ambiguous_variant) { + let token_kind = self.tokens[index].token.kind; + if let Some((unr_token, i)) = self.find_first_unresolved(indices) { + // first token COULD be ambiguous + // first is ambiguous, so we have to split it + unr_token.split_ambiguous(); + + // easier manipulation on first part of ambiguous unresolved token + if unr_token.token.kind != token_kind { + unr_token.swap_parts(); + } + + // resolve + unr_token.resolved = true; + + if unr_token.token.kind == token_kind { + unr_token.swap_parts(); + } + + if unr_token.resolved { + indices.remove(i); + } + + self.tokens[index].resolved = true; + } + } + } + } + // (3. and 4.) 
current IS ambiguous + else if self.tokens[index].token.closes() { + // there is unresolved one that IS ambiguous (ambiguous, ambiguous) + if let Some(indices) = token_map.get_mut(&self.tokens[index].token.kind) { + if let Some((unr_token, i)) = self.find_first_unresolved(indices) { + // split them both + unr_token.split_ambiguous(); + + // resolve them both + unr_token.resolved = true; + if let Some(second) = unr_token.second_part.as_mut() { + second.resolved = true; + } + + self.tokens[index].split_ambiguous(); + self.tokens[index].resolved = true; + if let Some(second) = self.tokens[index].second_part.as_mut() { + second.resolved = true; + } + + indices.remove(i); + } + } else if let Some((first_kind, second_kind)) = + // there is unresolved one that IS NOT ambiguous (simple, ambiguous) + self.tokens[index].token.kind.get_ambiguous_parts() + { + let (indices, kind) = if let Some(indices) = token_map.get_mut(&first_kind) { + (indices, first_kind) + } else if let Some(indices) = token_map.get_mut(&second_kind) { + (indices, second_kind) + } else { + // we can't really do anything at this point + return; + }; + + if self.resolve_partial_kind(indices, index, kind) { + // try to resolve the remaining part + // Should fall into case (1. and 2.) 
+ self.resolve_token(token_map, index); + } + } + } + + fn resolve_partial_kind( + &mut self, + indices: &mut Vec<usize>, + index: usize, + kind: TokenKind, + ) -> bool { + if let Some((unr_token, i)) = self.find_first_unresolved(indices) { + unr_token.resolved = true; + + let curr_token = &mut self.tokens[index]; + + curr_token.split_ambiguous(); + + if curr_token.token.kind != kind { + curr_token.swap_parts(); + } + curr_token.resolved = true; + + indices.remove(i); + + true + } else { + false + } + } + + fn find_first_unresolved( + &mut self, + indices: &[usize], + ) -> Option<(&mut UnresolvedToken, usize)> { + // find first unresolved token + for (i, idx) in indices.iter().enumerate() { + if !self.tokens[*idx].resolved && !self.tokens[*idx].token.opens() { + return Some((&mut self.tokens[*idx], i)); + } + } + + None + } +} diff --git a/inline/src/lexer/token.rs b/inline/src/lexer/token.rs index 5350a688..08c6f9b3 100644 --- a/inline/src/lexer/token.rs +++ b/inline/src/lexer/token.rs @@ -120,12 +120,12 @@ impl TokenBuilder { } /// Token lexed from Unimarkup text. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct Token { - kind: TokenKind, - span: Span, - spacing: Spacing, - content: Option<String>, + pub(crate) kind: TokenKind, + pub(crate) span: Span, + pub(crate) spacing: Spacing, + pub(crate) content: Option<String>, } impl Token { @@ -404,7 +404,7 @@ impl Token { } /// The kind of the token found in Unimarkup document. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum TokenKind { /// Bold delimiter token (`**`). 
Bold, @@ -550,6 +550,22 @@ impl TokenKind { Self::CloseParens | Self::CloseBracket | Self::CloseBrace ) } + + pub(crate) fn get_ambiguous_variant(&self) -> Option<Self> { + match self { + TokenKind::Bold | TokenKind::Italic => Some(Self::ItalicBold), + TokenKind::Underline | TokenKind::Subscript => Some(Self::UnderlineSubscript), + _ => None, + } + } + + pub(crate) fn get_ambiguous_parts(&self) -> Option<(Self, Self)> { + match self { + Self::ItalicBold => Some((Self::Italic, Self::Bold)), + Self::UnderlineSubscript => Some((Self::Underline, Self::Subscript)), + _ => None, + } + } } impl From<&Inline> for TokenKind { @@ -704,7 +720,7 @@ impl TokenDelimiters { } /// Enum representing the spacing surrounding a particular token in Unimarkup document. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum Spacing { /// Whitespace before the token. Pre, @@ -785,7 +801,7 @@ impl Sub for Spacing { /// Span used to store information about the space some [`Token`] occupies in Unimarkup document. /// /// [`Token`]: self::Token -#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct Span { start: Position, end: Position,