From 57fb41d41a15db6e39a0414cea249bc29dc3836c Mon Sep 17 00:00:00 2001 From: Manuel Hatzl Date: Wed, 23 Mar 2022 12:46:00 +0100 Subject: [PATCH] doc(inline): add documentation for the inline crate --- inline/src/ast/collect.rs | 14 +++++++- inline/src/ast/mod.rs | 28 +++++++++++++--- inline/src/ast/substitutions.rs | 6 +--- inline/src/lib.rs | 2 ++ inline/src/tokenizer/mod.rs | 48 +++++++++++++++++++++------ inline/src/tokenizer/tokens.rs | 58 +++++++++++++++++++++++++++++---- 6 files changed, 130 insertions(+), 26 deletions(-) diff --git a/inline/src/ast/collect.rs b/inline/src/ast/collect.rs index 5015e764..7be89f15 100644 --- a/inline/src/ast/collect.rs +++ b/inline/src/ast/collect.rs @@ -1,14 +1,22 @@ +//! This module provides functionality to create a Unimarkup inline AST out of a given list of tokens. + use crate::tokenizer::{Position, TokenKind, Tokens, Newline}; use super::{Span, NestedInline, InlineKind, FlatInline, substitutions::DirectSubstitution, Inline, FlattenInlineKind}; - +/// Struct to store partial collected inline tokens. +/// +/// Needed for nested tokens. pub(crate) struct InlineSection { + /// Partially collected inline tokens. pub(crate) content: Inline, + /// End position of the last inline token of the section. pub(crate) end: Position, } +/// Trait to create an inline AST. pub(crate) trait InlineAst { + /// Function to create an inline AST from a given input. fn collect(self) -> Inline; } @@ -19,6 +27,10 @@ impl InlineAst for Tokens { } } +/// Function to collect inline elements up until a certain token is reached. +/// +/// Note: The token of kind `token_kind` is the last token of the returned section, if it was found. +/// Otherwise, the given list of tokens is fully emptied. 
pub(crate) fn collect_until(tokens: &mut Tokens, token_kind: TokenKind) -> InlineSection { let mut inline = Vec::new(); let mut end: Position = Position::default(); diff --git a/inline/src/ast/mod.rs b/inline/src/ast/mod.rs index 6745df30..aa6751a0 100644 --- a/inline/src/ast/mod.rs +++ b/inline/src/ast/mod.rs @@ -1,3 +1,5 @@ +//! This module provides types and functionality to create a Unimarkup inline AST out of a given list of tokens. + use crate::tokenizer::{Position, TokenKind}; pub(crate) mod collect; @@ -6,45 +8,63 @@ mod substitutions; /// Represents an AST of Unimarkup inline elements pub type Inline = Vec; - +/// Convenient function to convert a string into plain inline. pub fn flat_inline(s: &str) -> Inline { vec![InlineKind::Plain(FlatInline{ content: s.to_string(), span: Span::default() })] } - - +/// Struct to set the span of an inline element in a given input. +/// +/// Note: If the inline element only consists of one grapheme, start and end point to the same position. #[derive(Debug, Default, Clone, PartialEq, Copy)] pub struct Span { + /// The start position of an inline element. pub start: Position, + /// The end position of an inline element. pub end: Position, } +/// Struct representing inline elements that allow nesting. #[derive(Debug, Default, Clone, PartialEq)] pub struct NestedInline { pub content: Vec, pub span: Span } +/// Struct representing inline elements that do not allow nesting. #[derive(Debug, Default, Clone, PartialEq)] pub struct FlatInline { pub content: String, pub span: Span, } +/// Enum representing all supported Unimarkup inline elements. #[derive(Debug, Clone, PartialEq)] pub enum InlineKind { + /// Representing the bold inline element. Bold(NestedInline), + /// Representing the italic inline element. Italic(NestedInline), + /// Representing the combined bold and italic inline element. BoldItalic(NestedInline), + /// Representing the verbatim inline element. Verbatim(FlatInline), + /// Representing plain text. 
Plain(FlatInline), + /// Representing explicit newlines. EscapedNewLine(FlatInline), + /// Representing explicit spaces. EscapedSpace(FlatInline), } +/// Trait to flatten inline elements. + pub trait FlattenInlineKind { + /// This function converts an inline element back into its original plain representation. + /// + /// e.g. `Bold(Plain(text))` --> `**text**` fn flatten(self) -> String; -} +} impl FlattenInlineKind for Vec { fn flatten(self) -> String { diff --git a/inline/src/ast/substitutions.rs b/inline/src/ast/substitutions.rs index 1d5c517e..d9d1784c 100644 --- a/inline/src/ast/substitutions.rs +++ b/inline/src/ast/substitutions.rs @@ -1,15 +1,11 @@ - +//! Defines possible direct substitutions. /// Trait for direct substitution pub trait DirectSubstitution { /// Substitutes supported arrows or leaves given input unchanged, if no supported arrow matched. - /// - /// - `possible_arrow` ... String that is tried to be substituted fn substitute_arrow(self) -> Self; /// Substitutes supported emojis or leaves given input unchanged, if no supported emoji matched. - /// - /// - `possible_emoji` ... String that is tried to be substituted fn substitute_emoji(self) -> Self; } diff --git a/inline/src/lib.rs b/inline/src/lib.rs index fd8b5f9b..a4f07e93 100644 --- a/inline/src/lib.rs +++ b/inline/src/lib.rs @@ -1,3 +1,5 @@ +//! This library provides functionality to get a Unimarkup inline AST from a given string + use ast::collect::InlineAst; use error::InlineError; diff --git a/inline/src/tokenizer/mod.rs b/inline/src/tokenizer/mod.rs index 5e4d6362..070a7462 100644 --- a/inline/src/tokenizer/mod.rs +++ b/inline/src/tokenizer/mod.rs @@ -1,3 +1,8 @@ +//! This module provides functionality to tokenize a given &str input. +//! The resulting list of tokens is a flat tokenized representation. +//! +//! e.g. 
`*text*` --> `[ItalicOpen][Plain][ItalicClose]` + use std::{collections::{HashMap, hash_map::Entry::Vacant}, cmp::min}; use unicode_segmentation::{Graphemes, UnicodeSegmentation}; @@ -7,17 +12,26 @@ pub use tokens::*; use crate::error::InlineError; - +/// Struct to link to the grapheme position of a token in the given input. #[derive(Debug, Default, Clone, PartialEq, Copy)] pub struct Position { + /// Line number in the given input. pub line: usize, + /// Column in the given input. pub column: usize, } - +/// Trait to convert a given input into a list of tokens. pub trait Tokenizer { + /// Takes an input and converts it into a list of tokens. + /// + /// Returns an error if inline constraints are violated. fn tokenize(self) -> Result; + /// Takes an input and an offset to convert the input into a list of tokens, + /// where the first token starts at the given offset. + /// + /// Returns an error if inline constraints are violated. fn tokenize_with_offset(self, offset: Position) -> Result; } @@ -38,14 +52,19 @@ impl Tokenizer for &str { } } +/// Internal structure to keep track of the tokenization process. #[derive(Debug)] struct Tokenized<'a> { + /// Input converted to a grapheme iterator. graphemes: Graphemes<'a>, + /// List of tokens that were tokenized so far. tokens: Vec::, + /// Map of open tokens that were not yet closed open_tokens: HashMap::, + /// The position inside the input of the current token being tokenized. cur_pos: Position, + /// Flag indicating that a grapheme must be escaped. escape_active: bool, - open_verbatim: bool, } impl<'a> From<(&'a str, Position)> for Tokenized<'a> { @@ -56,7 +75,6 @@ impl<'a> From<(&'a str, Position)> for Tokenized<'a> { open_tokens: Default::default(), cur_pos: offset, escape_active: false, - open_verbatim: false, } } } @@ -97,6 +115,7 @@ fn tokenize_until(tokenized: &mut Tokenized, token_kind: TokenKind) -> Result<() Ok(()) } +/// Handles verbatim tokens. 
fn update_accent(tokenized: &mut Tokenized, grapheme: &str) { if let Some(last) = tokenized.tokens.last() { tokenized.cur_pos.column += last.length(); @@ -106,17 +125,15 @@ fn update_accent(tokenized: &mut Tokenized, grapheme: &str) { true => { let new_token = Token{ kind: TokenKind::VerbatimClose, content: grapheme.to_string(), position: tokenized.cur_pos }; tokenized.tokens.push(new_token); - tokenized.open_verbatim = false; }, false => { let new_token = Token{ kind: TokenKind::VerbatimOpen, content: grapheme.to_string(), position: tokenized.cur_pos }; tokenized.tokens.push(new_token); - tokenized.open_verbatim = true; }, } } - +/// Updates the list of tokens by handling the next grapheme of the input. fn update_tokens(tokenized: &mut Tokenized, grapheme: &str) -> Result<(), InlineError> { if tokenized.escape_active { update_escaped(tokenized, grapheme); @@ -147,6 +164,10 @@ fn update_tokens(tokenized: &mut Tokenized, grapheme: &str) -> Result<(), Inline Ok(()) } +/// Handles text group tokenization by taking precedence over inline formattings. +/// This is achieved by recursive tokenization expecting text group close token. +/// +/// Note: The recursive approach enforces the closing constraint. fn open_text_group(tokenized: &mut Tokenized, grapheme: &str) -> Result<(), InlineError> { if let Some(last) = tokenized.tokens.last() { tokenized.cur_pos.column += last.length(); @@ -174,6 +195,7 @@ fn open_text_group(tokenized: &mut Tokenized, grapheme: &str) -> Result<(), Inli Ok(()) } +/// Function to close a text group if possible. 
fn try_closing_text_group(tokenized: &mut Tokenized, grapheme: &str) { if tokenized.open_tokens.remove(&TokenKind::TextGroupOpen).is_some() { if let Some(last) = tokenized.tokens.last() { tokenized.cur_pos.column += last.length(); } @@ -192,8 +214,7 @@ fn try_closing_text_group(tokenized: &mut Tokenized, grapheme: &str) { } } - -/// Function removes any dangling open token between open/close tokens of the last fix token, if it is a closing one +/// Function removes any dangling open token between open/close tokens of the last fix token, if it is a closing one. fn try_closing_fixated_token(tokenized: &mut Tokenized) { if let Some(last) = tokenized.tokens.last() { let open_index; @@ -257,7 +278,7 @@ fn try_closing_fixated_token(tokenized: &mut Tokenized) { /// Enteres the last fixed token into the open token hashmap, if it is an open token. /// -/// Note: Enforces open token contraints, changing a token to plain if a constraint is violated +/// Note: Enforces open token constraints, changing a token to plain if a constraint is violated. fn update_open_map(tokenized: &mut Tokenized, next_token_is_space_or_newline: bool) { if let Some(mut prev) = tokenized.tokens.pop() { // Makes sure that no two open tokens of the same kind are before one closing one @@ -295,6 +316,7 @@ fn update_open_map(tokenized: &mut Tokenized, next_token_is_space_or_newline: bo } } +/// Handles plain text. fn update_plain(tokenized: &mut Tokenized, grapheme: &str) { if let Some(last) = tokenized.tokens.last_mut() { if last.kind == TokenKind::Plain { @@ -310,6 +332,7 @@ fn update_plain(tokenized: &mut Tokenized, grapheme: &str) { } } +/// Handles escaped graphemes. 
fn update_escaped(tokenized: &mut Tokenized, grapheme: &str) { if let Some(last) = tokenized.tokens.last() { tokenized.cur_pos.column += last.length(); @@ -318,6 +341,7 @@ fn update_escaped(tokenized: &mut Tokenized, grapheme: &str) { tokenized.cur_pos.column += 1; // add backslash length offset for next token start } +/// Handles graphemes with Unicode whitespace property that are not a newline. fn update_space(tokenized: &mut Tokenized, grapheme: &str) { if let Some(last) = tokenized.tokens.last_mut() { if last.kind == TokenKind::Space { @@ -333,6 +357,7 @@ fn update_space(tokenized: &mut Tokenized, grapheme: &str) { } } +/// Handles newlines. fn update_newline(tokenized: &mut Tokenized, grapheme: &str) { if let Some(last) = tokenized.tokens.last() { tokenized.cur_pos.column += last.length(); @@ -344,6 +369,7 @@ fn update_newline(tokenized: &mut Tokenized, grapheme: &str) { tokenized.cur_pos.column = 0; } +/// Handles bold, italic and any combination of them. fn update_asterisk(tokenized: &mut Tokenized, grapheme: &str) { match tokenized.tokens.pop() { Some(mut last) => { @@ -520,6 +546,8 @@ fn update_asterisk(tokenized: &mut Tokenized, grapheme: &str) { } } +/// Cleans up open tokens. +/// /// Remaining open tokens that have no matching close token get converted to plain. /// Neighboring plain tokens get merged with the open token. fn cleanup_loose_open_tokens(tokenized: &mut Tokenized) { diff --git a/inline/src/tokenizer/tokens.rs b/inline/src/tokenizer/tokens.rs index d748cffe..affb43b9 100644 --- a/inline/src/tokenizer/tokens.rs +++ b/inline/src/tokenizer/tokens.rs @@ -1,17 +1,27 @@ +//! Defines all tokens used for tokenization. + use unicode_segmentation::UnicodeSegmentation; use super::Position; +/// Type representing a list of tokens pub type Tokens = Vec; +/// Token structure representing all supported inline elements with their +/// content and position inside a given input. 
#[derive(Debug, Default, Clone, PartialEq)] pub struct Token { + /// The token kind identifies the token parts of an Unimarkup inline element pub kind: TokenKind, + /// The content of the token pub content: String, + /// The starting position of this token inside a given input pub position: Position, } impl Token { + /// Returns the content length of a token. + /// The length is the number of Unicode graphemes inside the content. pub fn length(&self) -> usize { if self.kind == TokenKind::NewLine { return 0; @@ -19,64 +29,97 @@ impl Token { self.content.graphemes(true).count() } + /// Shows if a token is of kind space or newline. pub fn is_space_or_newline(&self) -> bool { self.kind.is_space_or_newline() } + /// Shows if a token is closing a scope inside a given input. + /// Closing scopes may be closing text groups, closing attribute blocks, ... pub fn closes_scope(&self) -> bool { self.kind == TokenKind::TextGroupClose } } +/// Enum defining all special single graphemes understood by Unimarkup. #[derive(Debug, Clone, PartialEq)] pub enum SingleTokenKind { + /// Default kind for all non-special graphemes. Plain, + /// Represents a newline grapheme. Newline, + /// Represents a grapheme that has the Unicode whitespace property and is not a newline. Space, + /// Represents `\`. Backslash, // ExclamationMark, // Ampersand, // Colon, // Caret, // Underscore, + /// Represents `*`. Asterisk, // Plus, + /// Represents `` ` ``. Accent, + /// Represents `[`. LeftSquareBracket, + /// Represents `]`. RightSquareBracket, } +/// Enum representing tokens that are part of Unimarkup inline elements. #[derive(Debug, Clone, PartialEq, Eq, Hash, Copy)] pub enum TokenKind { + /// Represents the open part of bold inline formatting. BoldOpen, + /// Represents the closing part of bold inline formatting. BoldClose, + /// Represents the open part of italic inline formatting. ItalicOpen, + /// Represents the closing part of italic inline formatting. 
ItalicClose, + /// Represents the combined open part of bold and italic inline formatting. BoldItalicOpen, + /// Represents the combined closing part of bold and italic inline formatting. BoldItalicClose, + /// Represents the open part of verbatim inline formatting. VerbatimOpen, + /// Represents the closing part of verbatim inline formatting. VerbatimClose, + /// Represents a plain text part. Plain, + /// Represents the open part of an inline emoji shortcut. EmojiOpen, // EmojiClose, + /// Represents a grapheme that is escaped by a backslash. EscapedGrapheme, + /// Represents a newline as defined by `is_newline()`. NewLine, + /// Represents a grapheme that has the Unicode whitespace property and is not a newline. Space, // CommentOpen, // CommentClose, // DirectUnicode, + /// Represents the open part of an inline text group. TextGroupOpen, + /// Represents the closing part of an inline text group. TextGroupClose, + /// Represents the end of a given input. Eoi, } impl Default for TokenKind { - fn default() -> Self { - TokenKind::Plain - } + /// Returns `Plain` as default token. + fn default() -> Self { + TokenKind::Plain + } } impl TokenKind { + /// Returns the string representation for a token. + /// + /// e.g. `**` for BoldOpen and BoldClose. pub fn as_str(&self) -> &'static str { match *self { TokenKind::BoldOpen => "**", @@ -104,13 +147,17 @@ impl TokenKind { } } + /// Shows if a token is either a space or newline. pub fn is_space_or_newline(&self) -> bool { self == &TokenKind::Space || self == &TokenKind::NewLine } } - +/// Trait to convert a type into a single token. pub trait AsSingleTokenKind { + /// Converts given type into a SingleTokenKind. + /// + /// e.g. `*` --> `SingleTokenKind::Asterisk` fn as_single_token_kind(&self) -> SingleTokenKind; } @@ -135,12 +182,11 @@ impl AsSingleTokenKind for &str { } pub trait Newline { + /// Note: Only temporary solution until rust supports is_newline() per default. 
fn is_newline(&self) -> bool; } impl Newline for &str { - /// Note: Only temporary solution until rust supports is_newline() per default. - /// /// Treats `\n`, `\r\n` and `\r` as one newline. fn is_newline(&self) -> bool { let s = *self;