Skip to content

Commit

Permalink
feat: implement basic lexer of plain text
Browse files Browse the repository at this point in the history
Add the necessary `struct`s and modules for Unimarkup inlines lexing.
Also implement the most basic lexing - scanning and storing plain text
into `Token`. Use `unicode_segmentation` crate for handling of
unicode graphemes.
  • Loading branch information
nfejzic committed Mar 25, 2022
1 parent c78f19a commit 329ff68
Show file tree
Hide file tree
Showing 3 changed files with 369 additions and 0 deletions.
222 changes: 222 additions & 0 deletions inline/src/lexer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
mod token;

use std::str::Lines;

use unicode_segmentation::*;

pub use token::*;

pub trait Tokenize {
fn lex(&self) -> Lexer;

fn lex_with_offs(&self, pos: Position) -> Lexer {
Lexer { pos, ..self.lex() }
}
}

/// Blanket implementation: anything viewable as a `&str` can be lexed.
///
/// Note: the `'a` lifetime parameter that used to decorate this impl was not
/// referenced by the trait, the self type, or the returned `Lexer`, so it has
/// been dropped.
impl<T> Tokenize for T
where
    T: AsRef<str>,
{
    fn lex(&self) -> Lexer {
        Lexer {
            input: self.as_ref(),
            // line starts at 0 so that loading the first input line
            // advances it to the 1-based line number 1
            pos: Position { line: 0, column: 1 },
        }
    }
}

/*
* This is a paragraph with \n newline inside
*
* OUTPUT: This is a paragraph with n newline inside
*
* .chars() -> Chars: Iterator
*
* \t, \n, \\, \*
*
* **Some text* bla**
*
* Bold, Plain, Italic
*
*
*
* */

/// Lexer for Unimarkup inline text.
///
/// Construct via [`Tokenize::lex`] or [`Tokenize::lex_with_offs`] on any
/// string-like value; obtain tokens through [`Lexer::iter`] or by iterating
/// over `&lexer`.
pub struct Lexer<'a> {
    input: &'a str, // full input text to be lexed
    pos: Position,  // position at which lexing starts (see `Tokenize::lex`)
}

impl<'a> Lexer<'a> {
    /// The escape grapheme `\`.
    const ESC: &'static str = "\\";
    /// The keyword grapheme `*`.
    const STAR: &'static str = "*";
    /// The keyword grapheme `_`.
    const ULINE: &'static str = "_";
    /// The keyword grapheme `^`.
    const CARET: &'static str = "^";
    /// The keyword grapheme `` ` ``.
    const TICK: &'static str = "`";

    /// Returns an iterator over the tokens of the input text,
    /// starting at the position this lexer was created with.
    pub fn iter(&self) -> TokenIterator<'a> {
        TokenIterator {
            lines: self.input.lines(),
            curr: Vec::new(), // grapheme cache, filled lazily one line at a time
            index: 0,
            pos: self.pos,
        }
    }
}

/// Enables `for token in &lexer` loops by delegating to [`Lexer::iter`].
impl<'a> IntoIterator for &'a Lexer<'a> {
    type Item = Token;

    type IntoIter = TokenIterator<'a>;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

/// Iterator over the [`Token`]s of a [`Lexer`]'s input.
///
/// Works line by line: the current line is cached as a vector of graphemes
/// (via `unicode_segmentation`) and consumed grapheme by grapheme.
pub struct TokenIterator<'a> {
    lines: Lines<'a>,   // remaining, not-yet-lexed input lines
    curr: Vec<&'a str>, // graphemes of the line currently being lexed
    index: usize,       // index of the next unconsumed grapheme in `curr`
    pos: Position,      // current position in input text
}

impl TokenIterator<'_> {
    /// Returns `true` if every grapheme of the cached line has been consumed.
    fn is_end_of_line(&self) -> bool {
        self.index >= self.curr.len()
    }

    /// Loads the next input line into the grapheme cache.
    ///
    /// Returns `false` when the input is exhausted.
    fn load_next_line(&mut self) -> bool {
        // remove last line from cache
        self.curr.clear();

        if let Some(next_line) = self.lines.next() {
            // load next line into cache
            self.curr.extend(next_line.graphemes(true));

            // update position: lines count from 1, columns restart at 1
            self.pos.line += 1;
            self.pos.column = 1;

            // update index into current line
            self.index = 0;

            return true;
        }

        false
    }

    /// Lexes a run of plain text starting at the current grapheme and
    /// returns it as a single `Plain` token.
    fn lex_plain(&mut self) -> Option<Token> {
        let start_pos = self.pos;
        let start_index = self.index;
        let mut content = String::with_capacity(self.curr.len());

        // multiple cases:
        // 1. got to end of line -> interpret as end of token
        // 2. escape grapheme found -> end interpretation
        // 3. some keyword found -> end interpretation
        // 4. any other grapheme -> consume into plain
        while let Some(grapheme) = self.curr.get(self.index) {
            if grapheme.is_esc() || grapheme.is_keyword() {
                // stop before the escape/keyword; it is lexed separately
                break;
            } else {
                content.push_str(grapheme);
                self.index += 1;
            }
        }

        // Advance the column by the number of graphemes actually consumed so
        // the next token on this line starts at the correct position.
        // (Previously the end column was offset by the absolute `index`,
        // which was only correct for tokens starting at the beginning of a
        // line, and `pos.column` itself was never advanced at all.)
        self.pos.column += self.index - start_index;
        let end_pos = self.pos;

        let token = Token::new(TokenKind::Plain)
            .with_content(content)
            .span(Span::from((start_pos, end_pos)))
            .space(Spacing::None);

        Some(token)
    }
}

impl<'a> Iterator for TokenIterator<'a> {
    type Item = Token;

    /// Produces the next token, refilling the grapheme cache from the next
    /// input line when the current one is fully consumed.
    fn next(&mut self) -> Option<Self::Item> {
        // stop once the current line is consumed and no further line exists
        if self.is_end_of_line() && !self.load_next_line() {
            return None;
        }

        // two cases:
        // 1. next grapheme is keyword -> generate some token
        // 2. next grapheme is not a keyword -> it is plain text
        if let Some(grapheme) = self.curr.get(self.index) {
            if grapheme.is_keyword() {
                // TODO: lex the keyword
                todo!()
            } else if grapheme.is_esc() {
                // TODO: lex escape
                // NOTE(review): this branch falls through to `None` without
                // consuming the escape grapheme, so iteration ends early on
                // escaped input — confirm this is intentional until escape
                // lexing is implemented.
            } else {
                return self.lex_plain();
            }
        }

        None
    }
}

/// Classification helpers for a single grapheme.
trait IsKeyword {
    /// Returns `true` if this grapheme is one of the inline markup keywords.
    fn is_keyword(&self) -> bool;
    /// Returns `true` if this grapheme is the escape grapheme.
    fn is_esc(&self) -> bool;
}

impl IsKeyword for &str {
    /// Returns `true` if this grapheme is one of the inline markup keywords
    /// recognized by the lexer.
    fn is_keyword(&self) -> bool {
        [Lexer::STAR, Lexer::ULINE, Lexer::CARET, Lexer::TICK].contains(self)
    }

    /// Returns `true` if this grapheme is the escape grapheme `\`.
    fn is_esc(&self) -> bool {
        // compare against the shared constant instead of a duplicated
        // literal, so the escape grapheme is defined in exactly one place
        *self == Lexer::ESC
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `str::lines` yields one item per physical line: a three-line input
    /// must produce exactly three items. (The input previously contained only
    /// two lines while the assertion expected three.)
    #[test]
    fn lines() {
        let input = "first line\nsecond line\nthird line";

        assert_eq!(input.lines().count(), 3);
    }

    /// A `Lexer` can be iterated by reference, and a single plain string
    /// produces exactly one token.
    #[test]
    fn into_iter() {
        let lexer = "Some string".lex();

        for token in &lexer {
            println!("{:?}", token);
        }

        assert_eq!(lexer.into_iter().count(), 1);
    }

    /// Plain text with no keywords is lexed into `Plain` tokens only.
    #[test]
    fn consume_plain() {
        let lexer = "Some string".lex();

        for token in &lexer {
            // `kind()` returns a reference, so compare against a reference;
            // the previously commented-out assertion compared `&TokenKind`
            // with `TokenKind` and would not have compiled.
            assert_eq!(token.kind(), &TokenKind::Plain);
        }
    }
}
145 changes: 145 additions & 0 deletions inline/src/lexer/token.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
use std::ops::{Add, AddAssign, Sub, SubAssign};

/// A single lexed token of Unimarkup inline text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    kind: TokenKind,
    span: Span,
    spacing: Spacing,
    content: Option<String>,
}

impl Token {
    /// Creates a token of the given kind with default span and spacing and
    /// no textual content.
    pub fn new(kind: TokenKind) -> Self {
        Token {
            kind,
            content: None,
            span: Span::default(),
            spacing: Spacing::default(),
        }
    }

    /// Builder method: attaches textual content to this token.
    pub fn with_content(self, content: String) -> Self {
        Self {
            content: Some(content),
            ..self
        }
    }

    /// Builder method: sets the span this token covers in the input.
    pub fn span(self, span: Span) -> Self {
        Self { span, ..self }
    }

    /// Builder method: sets the spacing surrounding this token.
    pub fn space(self, spacing: Spacing) -> Self {
        Self { spacing, ..self }
    }

    /// The kind of this token.
    pub fn kind(&self) -> &TokenKind {
        &self.kind
    }

    /// The spacing surrounding this token.
    pub fn spacing(&self) -> Spacing {
        self.spacing
    }
}

impl AsRef<str> for Token {
    /// Returns the token's textual content, falling back to the literal
    /// representation of its kind when no content is attached.
    fn as_ref(&self) -> &str {
        self.content.as_deref().unwrap_or_else(|| self.kind.as_ref())
    }
}

/// The kind of a lexed token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    Bold,
    Italic,
    Newline,
    Plain,
}

impl AsRef<str> for TokenKind {
    /// Returns the literal input representation of this token kind.
    /// `Plain` has no fixed representation and maps to the empty string.
    fn as_ref(&self) -> &str {
        match self {
            Self::Bold => "**",
            Self::Italic => "*",
            Self::Newline => "\n",
            Self::Plain => "",
        }
    }
}

/// Whitespace surrounding a token in the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Spacing {
    Pre,  // whitespace before the token only
    Post, // whitespace after the token only
    Both, // whitespace on both sides
    None, // no surrounding whitespace
}

impl Default for Spacing {
    /// Tokens are assumed to have no surrounding whitespace by default.
    fn default() -> Self {
        Self::None
    }
}

/// A region of input text delimited by a start and an end [`Position`].
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    start: Position,
    end: Position,
}

impl From<(Position, Position)> for Span {
    /// Builds a span from a `(start, end)` pair of positions.
    fn from(bounds: (Position, Position)) -> Self {
        let (start, end) = bounds;
        Span { start, end }
    }
}

/// A (line, column) location in the input text; both components are 1-based.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Position {
    pub line: usize,
    pub column: usize,
}

impl Default for Position {
    /// The location of the first grapheme of an input: line 1, column 1.
    fn default() -> Self {
        Position { line: 1, column: 1 }
    }
}

impl Add for Position {
    type Output = Position;

    /// Component-wise addition of two positions.
    fn add(self, other: Self) -> Self::Output {
        Self {
            line: self.line + other.line,
            column: self.column + other.column,
        }
    }
}

impl AddAssign for Position {
    fn add_assign(&mut self, other: Self) {
        // `Position` is `Copy`, so delegate to `Add` instead of duplicating
        // the component-wise arithmetic
        *self = *self + other;
    }
}

impl Sub for Position {
    type Output = Position;

    /// Component-wise subtraction of two positions.
    /// Panics on `usize` underflow (in debug builds), so `other` must not
    /// exceed `self` in either component.
    fn sub(self, other: Self) -> Self::Output {
        Self {
            line: self.line - other.line,
            column: self.column - other.column,
        }
    }
}

impl SubAssign for Position {
    fn sub_assign(&mut self, other: Self) {
        // delegate to `Sub`, mirroring `AddAssign`
        *self = *self - other;
    }
}
2 changes: 2 additions & 0 deletions inline/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
mod lexer;

pub use lexer::*;

0 comments on commit 329ff68

Please sign in to comment.