Skip to content

Commit

Permalink
chore: merge pull request #77 from Unimarkup/inlines-token-disambiguation
Browse files Browse the repository at this point in the history

feat: improve lexer and parser
  • Loading branch information
nfejzic committed Jan 6, 2023
2 parents c97e898 + 0518355 commit 21f8024
Show file tree
Hide file tree
Showing 9 changed files with 939 additions and 684 deletions.
4 changes: 2 additions & 2 deletions core/src/elements/inlines.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ impl Render for TokenDelimiters {
TokenKind::Overline => "<span style='text-decoration: overline;'>",
TokenKind::Strikethrough => "<span style='text-decoration: line-through;'>",
TokenKind::Highlight => "<span style='background-color: #ffaaaa;'>",
TokenKind::Verbatim => "<pre><code>",
TokenKind::Verbatim => "<code>",
TokenKind::Quote => "<span class='quote'>",
TokenKind::Math => "<span class='math'>",
TokenKind::OpenParens => "(",
Expand Down Expand Up @@ -47,7 +47,7 @@ impl Render for TokenDelimiters {
TokenKind::Overline => "</span>",
TokenKind::Strikethrough => "</span>",
TokenKind::Highlight => "</span>",
TokenKind::Verbatim => "</code></pre>",
TokenKind::Verbatim => "</code>",
TokenKind::Quote => "</span>",
TokenKind::Math => "</span>",
TokenKind::OpenParens => "(",
Expand Down
66 changes: 1 addition & 65 deletions inline/src/inlines/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::collections::VecDeque;

use crate::{Span, TokenDelimiters, TokenKind};

mod content;
Expand Down Expand Up @@ -85,26 +83,12 @@ impl Inline {
/// [`Inline`]: self::Inline
/// [`TokenKind`]: crate::TokenKind
/// [`InlineContent`]: self::content::InlineContent
pub fn new(mut content: InlineContent<PlainContent, NestedContent>, kind: TokenKind) -> Self {
pub fn new(content: InlineContent<PlainContent, NestedContent>, kind: TokenKind) -> Self {
let consume_as_plain = |content| match content {
InlineContent::Plain(plain_content) => Self::Plain(plain_content),
InlineContent::Nested(nested_content) => Self::Multiple(nested_content),
};

let span = content.span();
if let InlineContent::Nested(ref mut nested) = content {
// try to flatten content more
if nested.content.len() == 1 {
let inline = &mut nested.content[0];

if matches!(inline.as_ref(), InlineContent::Nested(_)) {
content = nested.content.pop_back().unwrap().into_inner();
dbg!(&content);
content.set_span(span);
}
}
}

match kind {
TokenKind::Bold => Self::Bold(content.into()),
TokenKind::Italic => Self::Italic(content.into()),
Expand Down Expand Up @@ -214,13 +198,6 @@ impl Inline {
}
}

/// Checks whether this [`Inline`] is a `Plain` text constructed from multiple other [`Inline`]s.
///
/// [`Inline`]: self::Inline
fn is_multiple(&self) -> bool {
matches!(self, Inline::Multiple(_))
}

/// Consumes this [`Inline`] and returns the inner [`InlineContent`] of it.
///
/// [`Inline`]: self::Inline
Expand Down Expand Up @@ -343,47 +320,6 @@ impl Inline {
| Inline::Substitution(content) => InlineContent::Nested(content),
}
}

/// Merges this [`Inline`] with another into one combined [`Inline`]. Since the other [`Inline`] might
/// contain multiple inlines inside, some of which aren't compatible with this one, the remaining [`Inline`]s
/// are returned in a [`VecDeque`].
///
/// [`Inline`]: self::Inline
/// [`VecDeque`]: std::collections::VecDeque
pub(crate) fn merge(self, next_inline: Inline) -> (Inline, VecDeque<Inline>) {
    // The kind of `self` decides which of `next_inline`'s children may be
    // absorbed, and what kind the rebuilt result will carry.
    let own_kind = TokenKind::from(&self);
    let is_multiple = next_inline.is_multiple();

    let mut current_content = self.into_inner();
    let next_content = next_inline.into_inner();

    let rest_of_inlines = match next_content {
        InlineContent::Plain(plain_content) => {
            // merge plains trivially
            current_content.append(plain_content.into());
            VecDeque::default()
        }
        InlineContent::Nested(nested_inlines) => {
            let mut content = nested_inlines.content;

            // Absorb leading inlines from the front of the queue as long as
            // they are compatible: everything when `next_inline` was not a
            // `Multiple`, otherwise only inlines of the same kind as `self`.
            while let Some(inline) = content.front() {
                let token_kind = TokenKind::from(inline);
                let should_append = !is_multiple || token_kind == own_kind;

                if should_append {
                    // `front()` just returned `Some`, so `pop_front` cannot fail.
                    current_content.append_inline(content.pop_front().unwrap());
                } else {
                    // First incompatible inline: it and everything after it
                    // are returned to the caller untouched.
                    break;
                }
            }

            content
        }
    };

    // Rebuild an inline of our own kind from the combined content; leftover
    // incompatible inlines travel back alongside it.
    let result_inline = Self::new(current_content, own_kind);
    (result_inline, rest_of_inlines)
}
}

impl From<PlainContent> for Inline {
Expand Down
120 changes: 78 additions & 42 deletions inline/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,42 @@ use std::{iter::Peekable, str::Lines};

use unicode_segmentation::*;

mod resolver;
mod token;

pub use token::*;

use crate::{Substitute, Substitutor};

use self::resolver::{RawToken, TokenResolver};

/// Used to create a Unimarkup [`Lexer`] over some data structure, most typically over some kind of
/// string, i.e. [`&str`].
///
/// [`Lexer`]: self::Lexer
/// [`&str`]: &str
// NOTE(review): this listing appears to interleave the pre-refactor
// (`lex*`) and post-refactor (`tokens*`) trait surfaces of a diff; confirm
// against the actual file before relying on the exact method set.
pub trait Tokenize {
    /// Creates the `Lexer` from this type.
    fn lex(&self) -> Lexer;

    /// Creates the `Lexer` from this type starting at the given offset.
    fn lex_with_offs(&self, pos: Position) -> Lexer {
        // Default implementation: lex from the start, then override position.
        Lexer { pos, ..self.lex() }
    }

    /// Creates an [`TokenIterator`] from this type.
    ///
    /// [`TokenIterator`]: self::TokenIterator
    fn lex_iter(&self) -> TokenIterator;

    /// Returns tokens found in self.
    fn tokens(&self) -> Tokens;

    /// Creates an [`TokenIterator`] from this type starting at the given offset.
    ///
    /// [`TokenIterator`]: self::TokenIterator
    fn lex_iter_with_offs(&self, pos: Position) -> TokenIterator {
        // Default implementation in terms of `lex_with_offs`.
        let lexer = self.lex_with_offs(pos);

        lexer.iter()
    }

    /// Returns tokens found in self starting from the given position.
    fn tokens_with_offs(&self, pos: Position) -> Tokens;
}

impl<'a> Tokenize for &'a str {
fn lex(&self) -> Lexer {
Lexer {
fn tokens(&self) -> Tokens {
let lexer = Lexer {
input: self,
pos: Position { line: 1, column: 1 },
}
};

Tokens::new(lexer.resolved())
}

fn lex_iter(&self) -> TokenIterator {
self.lex().iter()
fn tokens_with_offs(&self, pos: Position) -> Tokens {
let lexer = Lexer { input: self, pos };

Tokens::new(lexer.resolved())
}
}

Expand Down Expand Up @@ -257,7 +248,7 @@ impl<'a> Lexer<'a> {
///
/// [`TokenIterator`]: self::TokenIterator
/// [`Lexer`]: self::Lexer
pub fn iter(&self) -> TokenIterator<'a> {
fn iter(&self) -> TokenIterator<'a> {
let skip_lines_upto_index = self.pos.line.saturating_sub(1);
let mut lines = self.input.lines().peekable();

Expand All @@ -273,6 +264,10 @@ impl<'a> Lexer<'a> {
substitutor: Substitutor::new(),
}
}

/// Consumes this lexer and wraps its token stream in a [`TokenResolver`].
fn resolved(self) -> TokenResolver {
    let token_iter = self.iter();
    TokenResolver::new(token_iter)
}
}

impl<'a> IntoIterator for &'a Lexer<'a> {
Expand All @@ -290,7 +285,7 @@ impl<'a> IntoIterator for &'a Lexer<'a> {
/// [`Symbol`]: self::Symbol
/// [`Token`]: self::token::Token
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum Content {
pub(crate) enum ContentOption {
/// Annotates that content should be stored into [`Token`].
///
/// [`Token`]: crate::Token
Expand Down Expand Up @@ -411,11 +406,13 @@ impl TokenIterator<'_> {
|subst| subst.as_str().to_string(),
);

let token = TokenBuilder::new(kind)
.span(Span::from((start_pos, end_pos)))
.space(spacing)
.optional_content(content, kind.content_option())
.build();
let token = Token::with_conditional_content(
kind,
Span::from((start_pos, end_pos)),
spacing,
content,
kind.content_option(),
);

self.index = curr_index;

Expand Down Expand Up @@ -562,11 +559,12 @@ impl TokenIterator<'_> {
let temp_idx = self.index;
self.index = self.pos.column.saturating_sub(1);

let token = TokenBuilder::new(TokenKind::Plain)
.with_content(content)
.span(Span::from((start_pos, end_pos)))
.space(self.spacing_around(len))
.build();
let token = Token {
kind: TokenKind::Plain,
span: Span::from((start_pos, end_pos)),
spacing: self.spacing_around(len),
content: Some(content),
};

self.index = temp_idx;

Expand All @@ -592,11 +590,12 @@ impl TokenIterator<'_> {
TokenKind::Newline
};

let token = TokenBuilder::new(token_kind)
.with_content(String::from(symbol))
.span(Span::from((start_pos, end_pos)))
.space(Spacing::None)
.build();
let token = Token {
kind: token_kind,
span: Span::from((start_pos, end_pos)),
spacing: Spacing::None,
content: Some(symbol.into()),
};

self.index += 1;
Some(token)
Expand Down Expand Up @@ -683,5 +682,42 @@ impl<'a> Iterator for TokenIterator<'a> {
}
}

/// Iterator over fully resolved Unimarkup [`Token`]s.
///
/// Wraps the resolver's iterator. A resolved raw token may be split into two
/// parts; the remainder is cached and yielded on the following `next()` call.
///
/// [`Token`]: self::token::Token
#[derive(Debug, Clone)]
pub struct Tokens {
    // Underlying iterator over resolved raw tokens.
    iter: resolver::IntoIter,
    // Remainder of a raw token that was split on the previous `next()` call.
    cache: Option<RawToken>,
}

impl Tokens {
pub(crate) fn new(resolver: TokenResolver) -> Self {
Self {
iter: resolver.into_iter(),
cache: None,
}
}
}

impl Iterator for Tokens {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        // Prefer the part stashed by the previous call; otherwise pull the
        // next resolved raw token from the underlying iterator.
        let mut raw_token = match self.cache.take() {
            Some(cached) => cached,
            None => self.iter.next()?,
        };

        if let Some(first_part) = raw_token.pop() {
            // The raw token split; stash the remainder for the next call.
            self.cache = Some(raw_token);
            Some(Token::from(first_part))
        } else {
            Some(Token::from(raw_token))
        }
    }
}

#[cfg(test)]
mod tests;
Loading

0 comments on commit 21f8024

Please sign in to comment.