lexer crate

Janis 2025-08-07 02:00:44 +02:00
parent 0fa3200b85
commit bc0acf7e19
3 changed files with 467 additions and 0 deletions

14 Cargo.toml

@@ -1,3 +1,9 @@
[workspace]
resolver = "3"
members = [
"crates/lexer"
]
[package]
name = "compiler"
version = "0.1.0"
@@ -16,5 +22,13 @@ paste = "1.0.15"
petgraph = "0.6.5"
thiserror = "1.0.63"
unicode-xid = "0.2.4"
tracing = "0.1.41"
werkzeug = { path = "../../rust/werkzeug" }
[workspace.dependencies]
unicode-xid = "0.2.4"
tracing = "0.1.41"
werkzeug = { path = "../../rust/werkzeug" }

9 crates/lexer/Cargo.toml Normal file

@@ -0,0 +1,9 @@
[package]
name = "lexer"
version = "0.1.0"
edition = "2024"
[dependencies]
tracing = { workspace = true }
werkzeug = { workspace = true }
unicode-xid = { workspace = true }

444 crates/lexer/src/lib.rs Normal file

@@ -0,0 +1,444 @@
#![feature(slice_swap_unchecked, iter_collect_into)]
mod is_things {
/// True if `c` is considered whitespace according to the Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
// This is Pattern_White_Space.
//
// Note that this set is stable (i.e., it doesn't change with different
// Unicode versions), so it's ok to just hard-code the values.
matches!(
c,
// Usual ASCII suspects
'\u{0009}' // \t
| '\u{000A}' // \n
| '\u{000B}' // vertical tab
| '\u{000C}' // form feed
| '\u{000D}' // \r
| '\u{0020}' // space
// NEXT LINE from latin1
| '\u{0085}'
// Bidi markers
| '\u{200E}' // LEFT-TO-RIGHT MARK
| '\u{200F}' // RIGHT-TO-LEFT MARK
// Dedicated whitespace characters from Unicode
| '\u{2028}' // LINE SEPARATOR
| '\u{2029}' // PARAGRAPH SEPARATOR
)
}
/// True if `c` is valid as a first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of a valid identifier name.
pub fn is_id_start(c: char) -> bool {
// This is XID_Start OR '_' (which formally is not a XID_Start).
c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
}
/// True if `c` is valid as a non-first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of a valid identifier name.
pub fn is_id_continue(c: char) -> bool {
unicode_xid::UnicodeXID::is_xid_continue(c)
}
/// True if the passed string is lexically an identifier.
pub fn is_ident(string: &str) -> bool {
let mut chars = string.chars();
if let Some(start) = chars.next() {
is_id_start(start) && chars.all(is_id_continue)
} else {
false
}
}
pub fn is_digit(ch: char) -> bool {
('0'..='9').contains(&ch)
}
pub fn is_bin_digit(ch: char) -> bool {
ch == '0' || ch == '1'
}
pub fn is_nonzero_digit(ch: char) -> bool {
('1'..='9').contains(&ch)
}
pub fn is_oct_digit(ch: char) -> bool {
('0'..='7').contains(&ch)
}
pub fn is_hex_digit(ch: char) -> bool {
('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch)
}
}
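// A minimal sketch exercising the classifiers above (the test module and
// its names are illustrative):
#[cfg(test)]
mod classify_tests {
    use super::is_things::*;

    #[test]
    fn classifies_chars_and_idents() {
        // Pattern_White_Space covers the ASCII suspects and the dedicated
        // Unicode separators alike.
        assert!(is_whitespace('\t') && is_whitespace('\u{2028}'));
        // `_` may start an identifier, a digit may not.
        assert!(is_ident("_foo1"));
        assert!(!is_ident("1foo"));
        // Hex digits accept both cases; octal stops at `7`.
        assert!(is_hex_digit('F') && !is_oct_digit('8'));
    }
}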
macro_rules! tokens {
($vis:vis $ty_name:ident:
{
$($name2:ident),*
},
{
$($name:ident => $lexeme:literal),*
}) => {
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
$vis enum $ty_name {
$($name,
)*
$($name2,)*
}
impl ::core::fmt::Display for $ty_name {
fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
match self {
$(Self::$name => write!(f, "{}", $lexeme),)*
$(Self::$name2 => write!(f, "<{}>", stringify!($name2))),*
}
}
}
#[allow(dead_code)]
impl $ty_name {
$vis fn lexeme(&self) -> Option<&'static str> {
match self {
$(Self::$name => Some($lexeme),)*
$(Self::$name2 => None),*
}
}
/// Returns the number of chars in this lexeme, or 0 if it has no fixed lexeme.
$vis fn lexeme_len(&self) -> usize {
self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0)
}
$vis fn maybe_ident(&self) -> bool {
self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false)
}
$vis fn lexemes() -> &'static [(Self, &'static str)] {
&[
$((Self::$name, $lexeme)),*
]
}
}
};
}
tokens!(pub Token: {
Eof,
ParseError,
// Marker Token for any Comment
Comment,
DocComment,
// Marker Tokens for constants
CharConstant,
IntegerConstant,
IntegerHexConstant,
IntegerBinConstant,
IntegerOctConstant,
FloatingConstant,
FloatingExpConstant,
DotFloatingConstant,
DotFloatingExpConstant,
StringConstant,
IntegralType,
Ident
},
// Lexical Tokens:
{
SlashSlash => "//",
SlashStar => "/*",
// SlashStarStar => "/**",
StarSlash => "*/",
// SlashSlashSlash => "///",
// Punctuation:
OpenParens => "(",
CloseParens => ")",
OpenBrace => "{",
CloseBrace => "}",
OpenSquareBracket => "[",
CloseSquareBracket => "]",
Semi => ";",
Comma => ",",
Elipsis3 => "...",
Elipsis2 => "..",
Colon => ":",
Equal => "=",
// Keywords:
Void => "void",
Bool => "bool",
F32 => "f32",
F64 => "f64",
ISize => "isize",
USize => "usize",
Const => "const",
Volatile => "volatile",
Noalias => "noalias",
Fn => "fn",
Let => "let",
Var => "var",
If => "if",
As => "as",
Else => "else",
Return => "return",
Struct => "struct",
Type => "type",
Union => "union",
Enum => "enum",
Packed => "packed",
Extern => "extern",
Pub => "pub",
// Operators
Dot => ".",
MinusGreater => "->",
Bang => "!",
Tilde => "~",
Plus => "+",
Minus => "-",
Star => "*",
Slash => "/",
Percent => "%",
Less => "<",
Greater => ">",
LessEqual => "<=",
GreaterEqual => ">=",
EqualEqual => "==",
BangEqual => "!=",
PipePipe => "||",
AmpersandAmpersand => "&&",
Ampersand => "&",
Caret => "^",
Pipe => "|",
LessLess => "<<",
GreaterGreater => ">>",
Question => "?",
PlusEqual => "+=",
MinusEqual => "-=",
StarEqual => "*=",
SlashEqual => "/=",
PercentEqual => "%=",
AmpersandEqual => "&=",
PipeEqual => "|=",
CaretEqual => "^=",
LessLessEqual => "<<=",
GreaterGreaterEqual => ">>="
});
impl Token {
pub fn is_assignment_op(self) -> bool {
match self {
Token::PlusEqual
| Token::MinusEqual
| Token::StarEqual
| Token::SlashEqual
| Token::PercentEqual
| Token::PipeEqual
| Token::CaretEqual
| Token::AmpersandEqual
| Token::LessLessEqual
| Token::GreaterGreaterEqual
| Token::Equal => true,
_ => false,
}
}
pub fn is_unary_op(self) -> bool {
match self {
Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang => true,
_ => false,
}
}
pub fn is_binary_op(self) -> bool {
match self {
Token::Star
| Token::Slash
| Token::Percent
| Token::Pipe
| Token::Ampersand
| Token::Caret
| Token::Plus
| Token::Minus
| Token::PipePipe
| Token::AmpersandAmpersand
| Token::BangEqual
| Token::EqualEqual
| Token::Less
| Token::Greater
| Token::LessEqual
| Token::GreaterEqual
| Token::LessLess
| Token::GreaterGreater => true,
_ => false,
}
}
}
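// A short sketch of the generated API (module name illustrative): tokens
// with a fixed lexeme report it, marker tokens report `None` and display
// as `<Name>`.
#[cfg(test)]
mod token_tests {
    use super::Token;

    #[test]
    fn lexemes_and_predicates() {
        assert_eq!(Token::Fn.lexeme(), Some("fn"));
        assert_eq!(Token::Fn.lexeme_len(), 2);
        assert!(Token::Fn.maybe_ident());
        assert_eq!(Token::Ident.lexeme(), None);
        assert_eq!(Token::Ident.to_string(), "<Ident>");
        // operator classification used later by a parser
        assert!(Token::PlusEqual.is_assignment_op());
        assert!(Token::Star.is_unary_op() && Token::Star.is_binary_op());
    }
}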
/// A list of lexemes used by the `LexemeParser`.
/// `lexemes` contains every token that has a defined lexeme, such as `fn`, `f32`, `const`, etc.
/// The `LexemeList` keeps track of two offsets into the `lexemes` array,
/// splitting it into three windows:
/// - [0, start_candidates): tokens that are still being considered for parsing
/// - [start_candidates, end_candidates): candidate tokens whose lexeme has fully matched the input so far
/// - [end_candidates, len): tokens that have been filtered out and are no longer considered
/// On each step of the parsing loop, every remaining token is matched against
/// the next character: a token whose lexeme is fully consumed by the match is
/// swapped into the candidates window, a token that does not match is swapped
/// to the end, and a token that matches but is not yet complete stays in the
/// remaining window.
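/// For example, if `a` has already been promoted to a candidate and `ab` is
/// filtered out next, the windows change from `[ab, abc][a][]` (remaining,
/// candidates, filtered) to `[abc][a][ab]`.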
struct LexemeList {
lexemes: Box<[Token]>,
start_candidates: usize,
end_candidates: usize,
filtered: Vec<(usize, FilterResult)>,
}
enum FilterResult {
Remove,
Candidate,
}
impl LexemeList {
fn new() -> Self {
let lexemes = Token::lexemes()
.iter()
.map(|(tok, _)| tok.clone())
.collect::<Box<_>>();
Self {
start_candidates: lexemes.len(),
end_candidates: lexemes.len(),
lexemes,
filtered: Vec::new(),
}
}
fn clear(&mut self) {
self.start_candidates = self.lexemes.len();
self.end_candidates = self.lexemes.len();
}
fn remaining(&self) -> &[Token] {
&self.lexemes[0..self.start_candidates]
}
fn candidates(&self) -> &[Token] {
&self.lexemes[self.start_candidates..self.end_candidates]
}
fn step(&mut self, ch: char, pos: usize) {
// smartly reuse allocation for `filtered`
// truly one of the premature optimizations.
// but it just feels good, innit?
let mut filtered = core::mem::take(&mut self.filtered);
self.remaining()
.iter()
.enumerate()
.filter_map(|(i, tok)| {
let bytes = tok.lexeme().unwrap().as_bytes();
// SAFETY: all tokens in `self.remaining()` are lexical tokens, and
// they are all valid ascii
let c = unsafe {
// TODO: maybe keep a list of `Char<'_>`s around in order to
// support fully utf8 tokens?
char::from_u32_unchecked(bytes[pos] as u32)
};
match c == ch {
false => Some((i, FilterResult::Remove)),
true if bytes.len() <= pos + 1 => Some((i, FilterResult::Candidate)),
true => None,
}
})
.collect_into(&mut filtered);
// iterate in reverse order so that we can safely swap elements
// drain here so that we can possibly reuse the `filtered` Vec allocation
filtered.drain(..).rev().for_each(|(i, f)| {
match f {
// for candidates, swap the candidate with the last remaining
// token, then dec `start_candidates`
FilterResult::Candidate => {
// SAFETY: we know that `i` and `self.start_candidates - 1`
// are both valid indices: `self.start_candidates` starts at
// the end and each time it is decremented, one more element
// is removed from the front, so that as long as an element
// is remaining, `self.start_candidates` is always greater
// than 0.
// the order of the remaining elements is not meaningfully
// impacted because we only ever swap with elements after
// `i`, and `i` is the greatest index we will touch.
unsafe {
self.lexemes.swap_unchecked(i, self.start_candidates - 1);
self.start_candidates = self.start_candidates.saturating_sub(1);
}
}
// for removes, swap the last candidate with the last remaining
// token, then swap the remove with the last candidate, then dec
// `end_candidates` and `start_candidates`
FilterResult::Remove => {
unsafe {
// in the case that `start_candidates` ==
// `end_candidates`, no swap happens and that's fine.
// remove this: v
// [a,b,c][d,e,f][g,h,i]
// swap these: ^ ^
// [a,b,f][d,e,c][g,h,i]
// swap these: ^ ^
// [a,c,f][d,e,b][g,h,i]
// decrement both counters:
// [a,c][f,d,e][b,g,h,i]
self.lexemes
.swap_unchecked(self.start_candidates - 1, self.end_candidates - 1);
self.lexemes.swap_unchecked(i, self.end_candidates - 1);
self.start_candidates = self.start_candidates.saturating_sub(1);
self.end_candidates = self.end_candidates.saturating_sub(1);
}
}
}
});
// replace `filtered`
self.filtered = filtered;
}
}
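// A sketch of the window invariant after a single `step` (module name
// illustrative): feeding `+` keeps `+=` in the remaining window, promotes
// `+` to a candidate, and filters everything else out.
#[cfg(test)]
mod lexeme_list_tests {
    use super::{LexemeList, Token};

    #[test]
    fn step_partitions_windows() {
        let mut list = LexemeList::new();
        list.step('+', 0);
        assert_eq!(list.remaining(), &[Token::PlusEqual][..]);
        assert_eq!(list.candidates(), &[Token::Plus][..]);
    }
}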
/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
/// integral types, constants, or identifiers, are not parsed by this.
pub struct LexemeParser {
lexemes: LexemeList,
len: usize,
}
impl LexemeParser {
pub fn new() -> Self {
Self {
lexemes: LexemeList::new(),
len: 0,
}
}
pub fn parse(&mut self, mut tokens: impl Iterator<Item = char>) -> Option<Token> {
self.lexemes.clear();
// reset the per-lexeme cursor; `step` indexes each lexeme at `self.len`
self.len = 0;
loop {
let Some(ch) = tokens.next() else {
break;
};
if crate::is_things::is_whitespace(ch) {
break;
}
self.lexemes.step(ch, self.len);
// advance the cursor so the next character is matched at the next
// byte position of each remaining lexeme
self.len += 1;
if self.lexemes.remaining().is_empty() {
break;
}
}
self.lexemes.candidates().last().copied()
}
}
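// An end-to-end sketch (module name illustrative): `parse` consumes
// characters until whitespace or until no lexeme prefix matches, and
// yields a fully matched fixed lexeme, if any.
#[cfg(test)]
mod parser_tests {
    use super::{LexemeParser, Token};

    #[test]
    fn parses_fixed_lexemes() {
        let mut parser = LexemeParser::new();
        assert_eq!(parser.parse("fn main".chars()), Some(Token::Fn));
        assert_eq!(parser.parse("} x".chars()), Some(Token::CloseBrace));
        // identifiers have no fixed lexeme, so nothing matches
        assert_eq!(parser.parse("my_ident".chars()), None);
    }
}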