refactor: Move Token next to Expr (#4521)
max-sixty committed Jun 5, 2024
1 parent 6f711de commit 61dcf39
Showing 18 changed files with 283 additions and 285 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -48,6 +48,7 @@ consolidate-commits = true

[workspace.dependencies]
anyhow = "1.0.86"
+enum-as-inner = "0.6.0"
insta = {version = "1.39.0", features = ["colors", "glob", "yaml"]}
insta-cmd = "0.6.0"
itertools = "0.12.0"
2 changes: 1 addition & 1 deletion prqlc/prqlc-ast/Cargo.toml
@@ -12,7 +12,7 @@ version.workspace = true
doctest = false

[dependencies]
-enum-as-inner = "0.6.0"
+enum-as-inner = {workspace = true}
semver = {version = "1.0.23", features = ["serde"]}
serde = {workspace = true}
serde_yaml = {workspace = true, optional = true}
10 changes: 5 additions & 5 deletions prqlc/prqlc-ast/src/expr.rs
@@ -1,6 +1,5 @@
pub mod generic;
mod ident;
-mod literal;
mod ops;

use std::collections::HashMap;
@@ -9,8 +8,9 @@ use enum_as_inner::EnumAsInner;
use serde::{Deserialize, Serialize};

pub use self::ident::Ident;
-pub use self::literal::{Literal, ValueAndUnit};
pub use self::ops::{BinOp, UnOp};
+pub use self::token::{Literal, ValueAndUnit};
+use super::token;
use crate::span::Span;
use crate::Ty;

@@ -51,7 +51,7 @@ pub enum ExprKind {
        feature = "serde_yaml",
        serde(with = "serde_yaml::with::singleton_map")
    )]
-    Literal(Literal),
+    Literal(token::Literal),
    Pipeline(Pipeline),

    Tuple(Vec<Expr>),
@@ -153,8 +153,8 @@ pub type Range = generic::Range<Box<Expr>>;
pub type InterpolateItem = generic::InterpolateItem<Expr>;
pub type SwitchCase = generic::SwitchCase<Box<Expr>>;

-impl From<Literal> for ExprKind {
-    fn from(value: Literal) -> Self {
+impl From<token::Literal> for ExprKind {
+    fn from(value: token::Literal) -> Self {
        ExprKind::Literal(value)
    }
}
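
For orientation, a minimal usage sketch of the re-pointed conversion, assuming the crate is consumed as prqlc_ast and that a test like this lives downstream (the test name is hypothetical, not part of the commit):

use prqlc_ast::expr::ExprKind;
use prqlc_ast::token::Literal;

#[test]
fn literal_into_expr_kind() {
    // The From<token::Literal> impl lifts a lexed literal straight into an expression kind.
    let kind: ExprKind = Literal::Integer(42).into();
    assert!(matches!(kind, ExprKind::Literal(Literal::Integer(42))));
}
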
81 changes: 0 additions & 81 deletions prqlc/prqlc-ast/src/expr/literal.rs

This file was deleted.

2 changes: 2 additions & 0 deletions prqlc/prqlc-ast/src/lib.rs
@@ -1,9 +1,11 @@
pub mod expr;
pub mod span;
pub mod stmt;
+pub mod token;
mod types;

pub use expr::*;
pub use span::*;
pub use stmt::*;
+pub use token::*;
pub use types::*;
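
A tiny sketch (not in the commit) of what the new re-export buys: the same type is reachable through both the module path and the crate root, which is what the parse_error.rs change below relies on. The function name is illustrative only:

fn kinds_match(a: &prqlc_ast::TokenKind, b: &prqlc_ast::token::TokenKind) -> bool {
    // Both paths name the same TokenKind, so the two references compare directly.
    a == b
}
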
232 changes: 232 additions & 0 deletions prqlc/prqlc-ast/src/token.rs
@@ -0,0 +1,232 @@
use enum_as_inner::EnumAsInner;
use serde::{Deserialize, Serialize};

#[derive(Clone, PartialEq, Serialize, Deserialize, Eq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: std::ops::Range<usize>,
}

#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub enum TokenKind {
    NewLine,

    Ident(String),
    Keyword(String),
    #[cfg_attr(
        feature = "serde_yaml",
        serde(with = "serde_yaml::with::singleton_map")
    )]
    Literal(Literal),
    Param(String),

    Range {
        /// Whether the left side of the range is bound by the previous token
        /// (but it's not contained in this token)
        bind_left: bool,
        bind_right: bool,
    },
    Interpolation(char, String),

    /// single-char control tokens
    Control(char),

    ArrowThin, // ->
    ArrowFat, // =>
    Eq, // ==
    Ne, // !=
    Gte, // >=
    Lte, // <=
    RegexSearch, // ~=
    And, // &&
    Or, // ||
    Coalesce, // ??
    DivInt, // //
    // Pow, // **
    Annotate, // @

    // Aesthetics only
    Comment(String),
    DocComment(String),
    /// Vec containing comments between the newline and the line wrap
    // Currently we include the comments with the LineWrap token. This isn't
    // ideal, but I'm not sure of an easy way of having them be separate.
    // - The line wrap span technically includes the comments — on a newline,
    //   we need to look ahead to _after_ the comments to see if there's a
    //   line wrap, and exclude the newline if there is.
    // - We can only pass one token back
    //
    // Alternatives:
    // - Post-process the stream, removing the newline prior to a line wrap.
    //   But requires a whole extra pass.
    // - Change the functionality. But it's very nice to be able to comment
    //   something out and have line-wraps still work.
    LineWrap(Vec<TokenKind>),
}

#[derive(Debug, EnumAsInner, PartialEq, Clone, Serialize, Deserialize, strum::AsRefStr)]
pub enum Literal {
    Null,
    Integer(i64),
    Float(f64),
    Boolean(bool),
    String(String),
    Date(String),
    Time(String),
    Timestamp(String),
    ValueAndUnit(ValueAndUnit),
}

impl TokenKind {
    pub fn range(bind_left: bool, bind_right: bool) -> Self {
        TokenKind::Range {
            bind_left,
            bind_right,
        }
    }
}
// Compound units, such as "2 days 3 hours" can be represented as `2days + 3hours`
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ValueAndUnit {
    pub n: i64, // Do any DBs use floats or decimals for this?
    pub unit: String, // Could be an enum IntervalType,
}

impl std::fmt::Display for Literal {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Literal::Null => write!(f, "null")?,
            Literal::Integer(i) => write!(f, "{i}")?,
            Literal::Float(i) => write!(f, "{i}")?,

            Literal::String(s) => {
                quote_string(s, f)?;
            }

            Literal::Boolean(b) => {
                f.write_str(if *b { "true" } else { "false" })?;
            }

            Literal::Date(inner) | Literal::Time(inner) | Literal::Timestamp(inner) => {
                write!(f, "@{inner}")?;
            }

            Literal::ValueAndUnit(i) => {
                write!(f, "{}{}", i.n, i.unit)?;
            }
        }
        Ok(())
    }
}

fn quote_string(s: &str, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    let s = escape_all_except_quotes(s);

    if !s.contains('"') {
        return write!(f, r#""{s}""#);
    }

    if !s.contains('\'') {
        return write!(f, "'{s}'");
    }

    // when string contains both single and double quotes
    // find minimum number of double quotes
    let mut quotes = "\"\"".to_string();
    while s.contains(&quotes) {
        quotes += "\"";
    }
    write!(f, "{quotes}{s}{quotes}")
}

fn escape_all_except_quotes(s: &str) -> String {
    let mut result = String::new();
    for ch in s.chars() {
        if ch == '"' || ch == '\'' {
            result.push(ch);
        } else {
            result.extend(ch.escape_default());
        }
    }
    result
}

// This is here because Literal::Float(f64) does not implement Hash, so we cannot simply derive it.
// There are reasons for that, but chumsky::Error needs Hash for the TokenKind, so it can deduplicate
// tokens in error.
// So this hack could lead to duplicated tokens in error messages. Oh no.
#[allow(clippy::derived_hash_with_manual_eq)]
impl std::hash::Hash for TokenKind {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        core::mem::discriminant(self).hash(state);
    }
}

impl std::cmp::Eq for TokenKind {}

impl std::fmt::Display for TokenKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TokenKind::NewLine => write!(f, "new line"),
            TokenKind::Ident(s) => {
                if s.is_empty() {
                    // FYI this shows up in errors
                    write!(f, "an identifier")
                } else {
                    write!(f, "{s}")
                }
            }
            TokenKind::Keyword(s) => write!(f, "keyword {s}"),
            TokenKind::Literal(lit) => write!(f, "{}", lit),
            TokenKind::Control(c) => write!(f, "{c}"),

            TokenKind::ArrowThin => f.write_str("->"),
            TokenKind::ArrowFat => f.write_str("=>"),
            TokenKind::Eq => f.write_str("=="),
            TokenKind::Ne => f.write_str("!="),
            TokenKind::Gte => f.write_str(">="),
            TokenKind::Lte => f.write_str("<="),
            TokenKind::RegexSearch => f.write_str("~="),
            TokenKind::And => f.write_str("&&"),
            TokenKind::Or => f.write_str("||"),
            TokenKind::Coalesce => f.write_str("??"),
            TokenKind::DivInt => f.write_str("//"),
            // TokenKind::Pow => f.write_str("**"),
            TokenKind::Annotate => f.write_str("@{"),

            TokenKind::Param(id) => write!(f, "${id}"),

            TokenKind::Range {
                bind_left,
                bind_right,
            } => write!(
                f,
                "'{}..{}'",
                if *bind_left { "" } else { " " },
                if *bind_right { "" } else { " " }
            ),
            TokenKind::Interpolation(c, s) => {
                write!(f, "{c}\"{}\"", s)
            }
            TokenKind::Comment(s) => {
                writeln!(f, "#{}", s)
            }
            TokenKind::DocComment(s) => {
                writeln!(f, "#!{}", s)
            }
            TokenKind::LineWrap(comments) => {
                write!(f, "\n\\ ")?;
                for comment in comments {
                    write!(f, "{}", comment)?;
                }
                Ok(())
            }
        }
    }
}

impl std::fmt::Debug for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}..{}: {:?}", self.span.start, self.span.end, self.kind)
    }
}
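
To make the Display rules above concrete, a short sketch (not part of the commit; the test name is hypothetical, and the expected strings follow from quote_string and the Display impls as written):

use prqlc_ast::token::{Literal, TokenKind, ValueAndUnit};

#[test]
fn display_examples() {
    // Plain strings take double quotes; a string already containing `"` falls back to single quotes.
    assert_eq!(Literal::String("hello".to_string()).to_string(), r#""hello""#);
    assert_eq!(Literal::String(r#"say "hi""#.to_string()).to_string(), r#"'say "hi"'"#);
    // Dates, times and timestamps render with a leading `@`.
    assert_eq!(Literal::Date("2024-06-05".to_string()).to_string(), "@2024-06-05");
    // Compound units print as the value immediately followed by the unit.
    let dur = Literal::ValueAndUnit(ValueAndUnit { n: 2, unit: "days".to_string() });
    assert_eq!(dur.to_string(), "2days");
    // TokenKind's Display is what surfaces in parser error messages.
    assert_eq!(TokenKind::Coalesce.to_string(), "??");
}
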
3 changes: 2 additions & 1 deletion prqlc/prqlc-parser/src/err/parse_error.rs
@@ -3,9 +3,10 @@ use std::collections::HashSet;
use std::fmt::Display;
use std::hash::Hash;

+use prqlc_ast::TokenKind;

use crate::ast::Span;
use crate::err::error::{Error, ErrorSource, Reason, WithErrorInfo};
-use crate::lexer::TokenKind;

#[derive(Clone, Debug)]
pub struct ChumError<T: Hash + Eq> {