Skip to content

Commit

Permalink
chore: merge pull request #77 from Unimarkup/inlines-token-disambiguation
Browse files Browse the repository at this point in the history

feat: improve lexer and parser
  • Loading branch information
nfejzic committed Jan 6, 2023
2 parents c97e898 + 0518355 commit 21f8024
Show file tree
Hide file tree
Showing 9 changed files with 939 additions and 684 deletions.
4 changes: 2 additions & 2 deletions core/src/elements/inlines.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ impl Render for TokenDelimiters {
TokenKind::Overline => "<span style='text-decoration: overline;'>",
TokenKind::Strikethrough => "<span style='text-decoration: line-through;'>",
TokenKind::Highlight => "<span style='background-color: #ffaaaa;'>",
TokenKind::Verbatim => "<pre><code>",
TokenKind::Verbatim => "<code>",
TokenKind::Quote => "<span class='quote'>",
TokenKind::Math => "<span class='math'>",
TokenKind::OpenParens => "(",
Expand Down Expand Up @@ -47,7 +47,7 @@ impl Render for TokenDelimiters {
TokenKind::Overline => "</span>",
TokenKind::Strikethrough => "</span>",
TokenKind::Highlight => "</span>",
TokenKind::Verbatim => "</code></pre>",
TokenKind::Verbatim => "</code>",
TokenKind::Quote => "</span>",
TokenKind::Math => "</span>",
TokenKind::OpenParens => "(",
Expand Down
66 changes: 1 addition & 65 deletions inline/src/inlines/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::collections::VecDeque;

use crate::{Span, TokenDelimiters, TokenKind};

mod content;
Expand Down Expand Up @@ -85,26 +83,12 @@ impl Inline {
/// [`Inline`]: self::Inline
/// [`TokenKind`]: crate::TokenKind
/// [`InlineContent`]: self::content::InlineContent
pub fn new(mut content: InlineContent<PlainContent, NestedContent>, kind: TokenKind) -> Self {
pub fn new(content: InlineContent<PlainContent, NestedContent>, kind: TokenKind) -> Self {
let consume_as_plain = |content| match content {
InlineContent::Plain(plain_content) => Self::Plain(plain_content),
InlineContent::Nested(nested_content) => Self::Multiple(nested_content),
};

let span = content.span();
if let InlineContent::Nested(ref mut nested) = content {
// try to flatten content more
if nested.content.len() == 1 {
let inline = &mut nested.content[0];

if matches!(inline.as_ref(), InlineContent::Nested(_)) {
content = nested.content.pop_back().unwrap().into_inner();
dbg!(&content);
content.set_span(span);
}
}
}

match kind {
TokenKind::Bold => Self::Bold(content.into()),
TokenKind::Italic => Self::Italic(content.into()),
Expand Down Expand Up @@ -214,13 +198,6 @@ impl Inline {
}
}

/// Checks whether this [`Inline`] is a `Plain` text constructed from multiple other [`Inline`]s.
///
/// [`Inline`]: self::Inline
fn is_multiple(&self) -> bool {
matches!(self, Inline::Multiple(_))
}

/// Consumes this [`Inline`] and returns the inner [`InlineContent`] of it.
///
/// [`Inline`]: self::Inline
Expand Down Expand Up @@ -343,47 +320,6 @@ impl Inline {
| Inline::Substitution(content) => InlineContent::Nested(content),
}
}

/// Merges this [`Inline`] with another into one combined [`Inline`]. Since the other [`Inline`] might
/// contain multiple inlines inside, some of which aren't compatible with this one, the remaining [`Inline`]s
/// are returned in a [`VecDeque`].
///
/// [`Inline`]: self::Inline
/// [`VecDeque`]: std::collections::VecDeque
pub(crate) fn merge(self, next_inline: Inline) -> (Inline, VecDeque<Inline>) {
    // The kind of `self` decides which of `next_inline`'s children may be
    // absorbed, and what kind the rebuilt result will carry.
    let own_kind = TokenKind::from(&self);
    let is_multiple = next_inline.is_multiple();

    let mut current_content = self.into_inner();
    let next_content = next_inline.into_inner();

    let rest_of_inlines = match next_content {
        InlineContent::Plain(plain_content) => {
            // merge plains trivially
            current_content.append(plain_content.into());
            VecDeque::default()
        }
        InlineContent::Nested(nested_inlines) => {
            let mut content = nested_inlines.content;

            // Absorb leading inlines from the front of the queue as long as
            // they are compatible: everything when `next_inline` was not a
            // `Multiple`, otherwise only inlines of the same kind as `self`.
            while let Some(inline) = content.front() {
                let token_kind = TokenKind::from(inline);
                let should_append = !is_multiple || token_kind == own_kind;

                if should_append {
                    // `front()` just returned `Some`, so `pop_front` cannot fail.
                    current_content.append_inline(content.pop_front().unwrap());
                } else {
                    // First incompatible inline: it and everything after it
                    // are returned to the caller untouched.
                    break;
                }
            }

            content
        }
    };

    // Rebuild an inline of our own kind from the combined content; leftover
    // incompatible inlines travel back alongside it.
    let result_inline = Self::new(current_content, own_kind);
    (result_inline, rest_of_inlines)
}
}

impl From<PlainContent> for Inline {
Expand Down
120 changes: 78 additions & 42 deletions inline/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,42 @@ use std::{iter::Peekable, str::Lines};

use unicode_segmentation::*;

mod resolver;
mod token;

pub use token::*;

use crate::{Substitute, Substitutor};

use self::resolver::{RawToken, TokenResolver};

/// Used to create a Unimarkup [`Lexer`] over some data structure, most typically over some kind of
/// string, i.e. [`&str`].
///
/// [`Lexer`]: self::Lexer
/// [`&str`]: &str
// NOTE(review): this listing appears to interleave the pre-refactor
// (`lex*`) and post-refactor (`tokens*`) trait surfaces of a diff; confirm
// against the actual file before relying on the exact method set.
pub trait Tokenize {
    /// Creates the `Lexer` from this type.
    fn lex(&self) -> Lexer;

    /// Creates the `Lexer` from this type starting at the given offset.
    fn lex_with_offs(&self, pos: Position) -> Lexer {
        // Default implementation: lex from the start, then override position.
        Lexer { pos, ..self.lex() }
    }

    /// Creates an [`TokenIterator`] from this type.
    ///
    /// [`TokenIterator`]: self::TokenIterator
    fn lex_iter(&self) -> TokenIterator;

    /// Returns tokens found in self.
    fn tokens(&self) -> Tokens;

    /// Creates an [`TokenIterator`] from this type starting at the given offset.
    ///
    /// [`TokenIterator`]: self::TokenIterator
    fn lex_iter_with_offs(&self, pos: Position) -> TokenIterator {
        // Default implementation in terms of `lex_with_offs`.
        let lexer = self.lex_with_offs(pos);

        lexer.iter()
    }

    /// Returns tokens found in self starting from the given position.
    fn tokens_with_offs(&self, pos: Position) -> Tokens;
}

impl<'a> Tokenize for &'a str {
fn lex(&self) -> Lexer {
Lexer {
fn tokens(&self) -> Tokens {
let lexer = Lexer {
input: self,
pos: Position { line: 1, column: 1 },
}
};

Tokens::new(lexer.resolved())
}

fn lex_iter(&self) -> TokenIterator {
self.lex().iter()
fn tokens_with_offs(&self, pos: Position) -> Tokens {
let lexer = Lexer { input: self, pos };

Tokens::new(lexer.resolved())
}
}

Expand Down Expand Up @@ -257,7 +248,7 @@ impl<'a> Lexer<'a> {
///
/// [`TokenIterator`]: self::TokenIterator
/// [`Lexer`]: self::Lexer
pub fn iter(&self) -> TokenIterator<'a> {
fn iter(&self) -> TokenIterator<'a> {
let skip_lines_upto_index = self.pos.line.saturating_sub(1);
let mut lines = self.input.lines().peekable();

Expand All @@ -273,6 +264,10 @@ impl<'a> Lexer<'a> {
substitutor: Substitutor::new(),
}
}

/// Consumes this lexer and wraps its token stream in a [`TokenResolver`].
fn resolved(self) -> TokenResolver {
    let token_iter = self.iter();
    TokenResolver::new(token_iter)
}
}

impl<'a> IntoIterator for &'a Lexer<'a> {
Expand All @@ -290,7 +285,7 @@ impl<'a> IntoIterator for &'a Lexer<'a> {
/// [`Symbol`]: self::Symbol
/// [`Token`]: self::token::Token
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum Content {
pub(crate) enum ContentOption {
/// Annotates that content should be stored into [`Token`].
///
/// [`Token`]: crate::Token
Expand Down Expand Up @@ -411,11 +406,13 @@ impl TokenIterator<'_> {
|subst| subst.as_str().to_string(),
);

let token = TokenBuilder::new(kind)
.span(Span::from((start_pos, end_pos)))
.space(spacing)
.optional_content(content, kind.content_option())
.build();
let token = Token::with_conditional_content(
kind,
Span::from((start_pos, end_pos)),
spacing,
content,
kind.content_option(),
);

self.index = curr_index;

Expand Down Expand Up @@ -562,11 +559,12 @@ impl TokenIterator<'_> {
let temp_idx = self.index;
self.index = self.pos.column.saturating_sub(1);

let token = TokenBuilder::new(TokenKind::Plain)
.with_content(content)
.span(Span::from((start_pos, end_pos)))
.space(self.spacing_around(len))
.build();
let token = Token {
kind: TokenKind::Plain,
span: Span::from((start_pos, end_pos)),
spacing: self.spacing_around(len),
content: Some(content),
};

self.index = temp_idx;

Expand All @@ -592,11 +590,12 @@ impl TokenIterator<'_> {
TokenKind::Newline
};

let token = TokenBuilder::new(token_kind)
.with_content(String::from(symbol))
.span(Span::from((start_pos, end_pos)))
.space(Spacing::None)
.build();
let token = Token {
kind: token_kind,
span: Span::from((start_pos, end_pos)),
spacing: Spacing::None,
content: Some(symbol.into()),
};

self.index += 1;
Some(token)
Expand Down Expand Up @@ -683,5 +682,42 @@ impl<'a> Iterator for TokenIterator<'a> {
}
}

/// Iterator over fully resolved Unimarkup [`Token`]s.
///
/// Wraps the resolver's iterator. A resolved raw token may be split into two
/// parts; the remainder is cached and yielded on the following `next()` call.
///
/// [`Token`]: self::token::Token
#[derive(Debug, Clone)]
pub struct Tokens {
    // Underlying iterator over resolved raw tokens.
    iter: resolver::IntoIter,
    // Remainder of a raw token that was split on the previous `next()` call.
    cache: Option<RawToken>,
}

impl Tokens {
pub(crate) fn new(resolver: TokenResolver) -> Self {
Self {
iter: resolver.into_iter(),
cache: None,
}
}
}

impl Iterator for Tokens {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        // Prefer the part stashed by the previous call; otherwise pull the
        // next resolved raw token from the underlying iterator.
        let mut raw_token = match self.cache.take() {
            Some(cached) => cached,
            None => self.iter.next()?,
        };

        if let Some(first_part) = raw_token.pop() {
            // The raw token split; stash the remainder for the next call.
            self.cache = Some(raw_token);
            Some(Token::from(first_part))
        } else {
            Some(Token::from(raw_token))
        }
    }
}

#[cfg(test)]
mod tests;
Loading

0 comments on commit 21f8024

Please sign in to comment.