-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: implement basic lexer of plain text
Add the necessary `struct`s and modules for Unimarkup inlines lexing. Also implement the most basic lexing - scanning and storing plain text into `Token`. Use `unicode_segmentation` crate for handling of unicode graphemes.
- Loading branch information
Showing
3 changed files
with
369 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,222 @@ | ||
mod token; | ||
|
||
use std::str::Lines; | ||
|
||
use unicode_segmentation::*; | ||
|
||
pub use token::*; | ||
|
||
pub trait Tokenize { | ||
fn lex(&self) -> Lexer; | ||
|
||
fn lex_with_offs(&self, pos: Position) -> Lexer { | ||
Lexer { pos, ..self.lex() } | ||
} | ||
} | ||
|
||
impl<'a, T> Tokenize for T | ||
where | ||
T: AsRef<str> + 'a, | ||
{ | ||
fn lex(&self) -> Lexer { | ||
Lexer { | ||
input: self.as_ref(), | ||
pos: Position { line: 0, column: 1 }, | ||
} | ||
} | ||
} | ||
|
||
/* | ||
* This is a paragraph with \n newline inside | ||
* | ||
* OUTPUT: This is a paragraph with n newline inside | ||
* | ||
* .chars() -> Chars: Iterator | ||
* | ||
* \t, \n, \\, \* | ||
* | ||
* **Some text* bla** | ||
* | ||
* Bold, Plain, Italic | ||
* | ||
* | ||
* | ||
* */ | ||
|
||
/// Lexer over a Unimarkup inline source text.
pub struct Lexer<'a> {
    /// The full input text to lex.
    input: &'a str,
    /// Position the lexer starts reporting from (may be offset).
    pos: Position,
}
|
||
impl<'a> Lexer<'a> {
    // Graphemes with special meaning for Unimarkup inline formatting.
    const ESC: &'static str = "\\";
    const STAR: &'static str = "*";
    const ULINE: &'static str = "_";
    const CARET: &'static str = "^";
    const TICK: &'static str = "`";

    /// Returns an iterator over the tokens of the input.
    pub fn iter(&self) -> TokenIterator<'a> {
        TokenIterator {
            lines: self.input.lines(),
            curr: Vec::new(), // line cache, filled lazily by `load_next_line`
            index: 0,
            pos: self.pos,
        }
    }
}
|
||
/// Enables `for token in &lexer { … }`.
impl<'a> IntoIterator for &'a Lexer<'a> {
    type Item = Token;

    type IntoIter = TokenIterator<'a>;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}
|
||
/// Iterator producing [`Token`]s from the input, one line at a time.
pub struct TokenIterator<'a> {
    /// Remaining, not-yet-cached lines of the input.
    lines: Lines<'a>,
    /// Graphemes of the line currently being lexed.
    curr: Vec<&'a str>,
    /// Index of the next grapheme in `curr`.
    index: usize,
    pos: Position, // in input text
}
|
||
impl TokenIterator<'_> { | ||
fn is_end_of_line(&self) -> bool { | ||
self.index >= self.curr.len() | ||
} | ||
|
||
fn load_next_line(&mut self) -> bool { | ||
// remove last line from cache | ||
self.curr.clear(); | ||
|
||
if let Some(next_line) = self.lines.next() { | ||
// load next line into cache | ||
self.curr.extend(next_line.graphemes(true)); | ||
|
||
// update position | ||
self.pos.line += 1; | ||
self.pos.column = 1; | ||
|
||
// update index into current line | ||
self.index = 0; | ||
|
||
return true; | ||
} | ||
|
||
// two cases: | ||
// 1. next grapheme is keyword -> generate some token | ||
// 2. next grapheme is not a keyword -> it is plain text | ||
|
||
false | ||
} | ||
|
||
fn lex_plain(&mut self) -> Option<Token> { | ||
let start_pos = self.pos; | ||
let mut content = String::with_capacity(self.curr.len()); | ||
|
||
// multiple cases: | ||
// 1. got to end of line -> interpret as end of token | ||
// 2. escape grapheme found -> end interpretation | ||
// 3. some keyword found -> end interpretation | ||
// 4. any other grapheme -> consume into plain | ||
|
||
while let Some(grapheme) = self.curr.get(self.index) { | ||
if grapheme.is_esc() || grapheme.is_keyword() { | ||
// skip the escape character and continue from next character | ||
break; | ||
} else { | ||
content.push_str(grapheme); | ||
self.index += 1; | ||
} | ||
} | ||
|
||
let mut end_pos = self.pos; | ||
end_pos.column += self.index; | ||
|
||
let token = Token::new(TokenKind::Plain) | ||
.with_content(content) | ||
.span(Span::from((start_pos, end_pos))) | ||
.space(Spacing::None); | ||
|
||
Some(token) | ||
} | ||
} | ||
|
||
impl<'a> Iterator for TokenIterator<'a> {
    type Item = Token;

    /// Produces the next token, loading further input lines as needed.
    fn next(&mut self) -> Option<Self::Item> {
        // Refill the line cache; `None` once the input is exhausted.
        if self.is_end_of_line() && !self.load_next_line() {
            return None;
        }

        if let Some(grapheme) = self.curr.get(self.index) {
            if grapheme.is_keyword() {
                // TODO: lex the keyword
                todo!()
            } else if grapheme.is_esc() {
                // TODO: lex escape
                // NOTE(review): until escape lexing is implemented, this
                // branch falls through and returns `None`, ending iteration
                // even though unread input remains — confirm intended.
            } else {
                return self.lex_plain();
            }
        }

        None
    }
}
|
||
/// Classification of a single grapheme for lexing.
trait IsKeyword {
    /// True if this grapheme is an inline-formatting keyword.
    fn is_keyword(&self) -> bool;
    /// True if this grapheme is the escape character (`\`).
    fn is_esc(&self) -> bool;
}
|
||
impl IsKeyword for &str { | ||
fn is_keyword(&self) -> bool { | ||
[Lexer::STAR, Lexer::ULINE, Lexer::CARET, Lexer::TICK].contains(self) | ||
} | ||
|
||
fn is_esc(&self) -> bool { | ||
*self == "\\" | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lines() {
        // One blank line between the two text lines -> three lines total.
        // (The blank middle line is required for the count below; the
        // name "third line" refers to it.)
        let input = r#"first line

third line"#;

        assert_eq!(input.lines().count(), 3);
    }

    #[test]
    fn into_iter() {
        let lexer = "Some string".lex();

        for token in &lexer {
            println!("{:?}", token);
        }

        // The whole input is plain text -> exactly one token.
        assert_eq!(lexer.into_iter().count(), 1);
    }

    #[test]
    fn consume_plain() {
        let lexer = "Some string".lex();

        println!("\nTesting consume_plain: \n\n");

        for token in &lexer {
            println!("{token:?}");
            // assert_eq!(token.kind(), TokenKind::Plain)
        }

        println!("\nEnd consume_plain test\n\n");
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
use std::ops::{Add, AddAssign, Sub, SubAssign}; | ||
|
||
/// A single lexed unit of Unimarkup inline text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    kind: TokenKind,
    /// Start and end position of the token in the input.
    span: Span,
    /// Whitespace found around the token.
    spacing: Spacing,
    /// Textual content; `None` for tokens represented by their kind alone.
    content: Option<String>,
}
|
||
impl Token { | ||
pub fn new(kind: TokenKind) -> Self { | ||
Self { | ||
kind, | ||
span: Span::default(), | ||
spacing: Spacing::default(), | ||
content: None, | ||
} | ||
} | ||
|
||
pub fn with_content(mut self, content: String) -> Self { | ||
self.content = Some(content); | ||
self | ||
} | ||
|
||
pub fn span(mut self, span: Span) -> Self { | ||
self.span = span; | ||
self | ||
} | ||
|
||
pub fn space(mut self, spacing: Spacing) -> Self { | ||
self.spacing = spacing; | ||
self | ||
} | ||
|
||
pub fn kind(&self) -> &TokenKind { | ||
&self.kind | ||
} | ||
|
||
pub fn spacing(&self) -> Spacing { | ||
self.spacing | ||
} | ||
} | ||
|
||
impl AsRef<str> for Token { | ||
fn as_ref(&self) -> &str { | ||
match self.content { | ||
Some(ref content) => content, | ||
None => self.kind.as_ref(), | ||
} | ||
} | ||
} | ||
|
||
/// The kinds of tokens the lexer can produce.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// Bold formatting keyword (`**`).
    Bold,
    /// Italic formatting keyword (`*`).
    Italic,
    /// Line break (`\n`).
    Newline,
    /// Plain text with no special meaning.
    Plain,
}
|
||
impl AsRef<str> for TokenKind { | ||
fn as_ref(&self) -> &str { | ||
match *self { | ||
TokenKind::Bold => "**", | ||
TokenKind::Italic => "*", | ||
TokenKind::Newline => "\n", | ||
TokenKind::Plain => "", | ||
} | ||
} | ||
} | ||
|
||
/// Whitespace found around a token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Spacing {
    /// Whitespace before the token.
    Pre,
    /// Whitespace after the token.
    Post,
    /// Whitespace on both sides.
    Both,
    /// No surrounding whitespace.
    None,
}
|
||
impl Default for Spacing {
    // NOTE(review): could be replaced with `#[derive(Default)]` plus
    // `#[default]` on the `None` variant once MSRV >= 1.62 — confirm MSRV.
    fn default() -> Self {
        Self::None
    }
}
|
||
/// The region of input a token covers, from `start` to `end`.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    start: Position,
    end: Position,
}
|
||
impl From<(Position, Position)> for Span { | ||
fn from((start, end): (Position, Position)) -> Self { | ||
Self { start, end } | ||
} | ||
} | ||
|
||
/// A line/column position in the input text (1-based, per `Default`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Position {
    pub line: usize,
    pub column: usize,
}
|
||
impl Default for Position {
    /// Line and column numbering both start at 1.
    fn default() -> Self {
        Self { line: 1, column: 1 }
    }
}
|
||
impl Add for Position { | ||
type Output = Position; | ||
|
||
fn add(self, rhs: Self) -> Self::Output { | ||
Position { | ||
line: self.line + rhs.line, | ||
column: self.column + rhs.column, | ||
} | ||
} | ||
} | ||
|
||
impl AddAssign for Position { | ||
fn add_assign(&mut self, rhs: Self) { | ||
self.line += rhs.line; | ||
self.column += rhs.column; | ||
} | ||
} | ||
|
||
impl Sub for Position { | ||
type Output = Position; | ||
|
||
fn sub(self, rhs: Self) -> Self::Output { | ||
Position { | ||
line: self.line - rhs.line, | ||
column: self.column - rhs.column, | ||
} | ||
} | ||
} | ||
|
||
impl SubAssign for Position { | ||
fn sub_assign(&mut self, rhs: Self) { | ||
self.line -= rhs.line; | ||
self.column -= rhs.column; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
mod lexer; | ||
|
||
pub use lexer::*; |