more refactoring

raw identifiers, comment out -- and ++ tokens
compiler warnings
2025-09-27 18:20:52 +02:00 · 2025-09-27 17:07:50 +02:00 · 2025-09-26 14:58:39 +02:00 · 2025-09-26 14:52:53 +02:00
2 changed files with 156 additions and 238 deletions
--- a/crates/lexer/src/complex_tokens.rs
+++ b/crates/lexer/src/complex_tokens.rs
@@ -18,8 +18,6 @@ pub enum Error {
    FloatingConstantInvalidTrailingType,
    #[error("Invalid token.")]
    InvalidToken,
-    #[error("Identifier starts with invalid character.")]
-    ExpectedIdStartForIdentifier,
    #[error("Unknown suffix in constant.")]
    NumericalConstantUnknownSuffix,
 }
@@ -73,6 +71,8 @@ impl Radix {
            Radix::Dec => Token::IntegerConstant,
        }
    }
+
+    #[expect(dead_code)]
    pub fn from_token(token: Token) -> Option<Self> {
        match token {
            Token::IntegerHexConstant => Some(Radix::Hex),
@@ -82,6 +82,8 @@ impl Radix {
            _ => None,
        }
    }
+
+    #[expect(dead_code)]
    pub fn map_digit(self, c: char) -> u8 {
        match self {
            Radix::Hex => match c {
@@ -104,6 +106,8 @@ impl Radix {
            },
        }
    }
+
+    #[expect(dead_code)]
    pub fn folding_method(self) -> fn(u64, char) -> u64 {
        match self {
            Radix::Hex => {
@@ -152,10 +156,10 @@ impl Radix {
    }
    pub fn is_digit(self) -> fn(char) -> bool {
        match self {
-            Radix::Hex => crate::is_things::is_hex_digit,
-            Radix::Bin => crate::is_things::is_bin_digit,
-            Radix::Oct => crate::is_things::is_oct_digit,
-            Radix::Dec => crate::is_things::is_digit,
+            Radix::Hex => is_things::is_hex_digit,
+            Radix::Bin => is_things::is_bin_digit,
+            Radix::Oct => is_things::is_oct_digit,
+            Radix::Dec => is_things::is_digit,
        }
    }
 }
@@ -193,11 +197,7 @@ fn try_parse_integral_type(source: &mut Source) -> Result<Option<()>> {
        return Ok(None);
    }

-    if source
-        .take_while_ref(|&c| crate::is_things::is_digit(c))
-        .count()
-        <= 0
-    {
+    if source.take_while_ref(|&c| is_things::is_digit(c)).count() <= 0 {
        return Err(Error::IntegralTypeExpectedDigit);
    };

@@ -214,7 +214,7 @@ fn try_parse_exp_part(source: &mut Source) -> Result<Option<()>> {
    if source.next_if(|&c| c.to_ascii_lowercase() == 'e').is_some() {
        let _sign = source.next_if(|&c| c == '-' || c == '+');
        if source
-            .take_while_ref(|&c| crate::is_things::is_digit(c))
+            .take_while_ref(|&c| is_things::is_digit(c))
            .count()
            .lt(&1)
        {
@@ -300,7 +300,7 @@ pub(crate) fn parse_constant(source: &mut Source) -> Result<Token> {
    // char following a constant must not be id_continue
    if source
        .peek()
-        .map(|&c| crate::is_things::is_id_continue(c))
+        .map(|&c| is_things::is_id_continue(c))
        .unwrap_or(false)
    {
        return Err(Error::NumericalConstantUnknownSuffix);
--- a/crates/lexer/src/lib.rs
+++ b/crates/lexer/src/lib.rs
@@ -58,12 +58,10 @@ mod is_things {
        }
    }

-    #[expect(dead_code)]
    pub fn is_digit(ch: char) -> bool {
        ('0'..='9').contains(&ch)
    }

-    #[expect(dead_code)]
    pub fn is_bin_digit(ch: char) -> bool {
        ch == '0' || ch == '1'
    }
@@ -73,12 +71,10 @@ mod is_things {
        ('1'..='9').contains(&ch)
    }

-    #[expect(dead_code)]
    pub fn is_oct_digit(ch: char) -> bool {
        ('0'..='7').contains(&ch)
    }

-    #[expect(dead_code)]
    pub fn is_hex_digit(ch: char) -> bool {
        ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch)
    }
@@ -188,6 +184,16 @@ tokens!(pub Token: {
        F64 => "f64",
        ISize => "isize",
        USize => "usize",
+        U1 => "u1",
+        U8 => "u8",
+        U16 => "u16",
+        U32 => "u32",
+        U64 => "u64",
+        I1 => "i1",
+        I8 => "i8",
+        I16 => "i16",
+        I32 => "i32",
+        I64 => "i64",
        Const => "const",
        Volatile => "volatile",
        Noalias => "noalias",
@@ -211,9 +217,9 @@ tokens!(pub Token: {
        Bang => "!",
        Tilde => "~",
        Plus => "+",
-        PlusPlus => "++",
+        // PlusPlus => "++",
        Minus => "-",
-        MinusMinus => "--",
+        // MinusMinus => "--",
        Star => "*",
        Slash => "/",
        Percent => "%",
@@ -291,185 +297,29 @@ impl Token {
    }
 }

-/// A list of lexemes used by the `LexemeParser`.
-/// `lexemes` contains every token that has a defined lexeme, such as `fn`, `f32`, `const`, etc.
-/// The `LexemeList` keeps track of two offsets into the `lexemes` array,
-/// splitting it into three windows:
-/// - [0, start_candidates) - tokens that are still being considered for parsing
-/// - [start_candidates, end_candidates) - the tokens which this lexeme matches
-/// - [end_candidates, len) - tokens that have been filtered out and are no longer considered
-/// On each iteration of the parsing loop, the remaining tokens are matched
-/// against the next character and, if they match completely, are swapped into
-/// the candidates window, or swapped to the end if they don't.
-struct LexemeList {
-    lexemes: Box<[Token]>,
-    start_candidates: usize,
-    end_candidates: usize,
-    filtered: Vec<(usize, FilterResult)>,
-}
+use std::ops::Range;

-enum FilterResult {
-    Remove,
-    Candidate,
-}
-
-impl LexemeList {
-    fn new() -> Self {
-        let lexemes = Token::lexemes()
-            .iter()
-            .map(|(tok, _)| tok.clone())
-            .collect::<Box<_>>();
-
-        Self {
-            start_candidates: lexemes.len(),
-            end_candidates: lexemes.len(),
-            lexemes,
-            filtered: Vec::new(),
-        }
-    }
-
-    fn clear(&mut self) {
-        self.start_candidates = self.lexemes.len();
-        self.end_candidates = self.lexemes.len();
-    }
-
-    fn remaining(&self) -> &[Token] {
-        &self.lexemes[0..self.start_candidates]
-    }
-
-    fn candidates(&self) -> &[Token] {
-        &self.lexemes[self.start_candidates..self.end_candidates]
-    }
-
-    fn step(&mut self, ch: char, pos: usize) {
-        // smartly reuse allocation for `filtered`
-        // truly one of the premature optimizations.
-        // but it just feels good, innit?
-        let mut filtered = core::mem::take(&mut self.filtered);
-
-        self.remaining()
-            .iter()
-            .enumerate()
-            .filter_map(|(i, tok)| {
-                let bytes = tok.lexeme().unwrap().as_bytes();
-                // SAFETY: all tokens in `self.remaining()` are lexical tokens, and
-                // they are all valid ascii
-                let c = unsafe {
-                    // TODO: maybe keep a list of `Char<'_>`s around in order to
-                    // support fully utf8 tokens?
-                    char::from_u32_unchecked(bytes[pos] as u32)
-                };
-                match c == ch {
-                    false => Some((i, FilterResult::Remove)),
-                    true if bytes.len() <= pos + 1 => Some((i, FilterResult::Candidate)),
-                    true => None,
-                }
-            })
-            .collect_into(&mut filtered);
-
-        // iterate in reverse order so that we can safely swap elements
-        // drain here so that we can possibly reuse the `filtered` Vec allcoation
-        filtered.drain(..).rev().for_each(|(i, f)| {
-            match f {
-                // for candidates, swap the candidate with the last remaining
-                // token, then dec `start_candidates`
-                FilterResult::Candidate => {
-                    // SAFETY: we know that `i` and `self.start_candidates - 1`
-                    // are both valid indices: `self.start_candidates` starts at
-                    // the end and each time it is decremented, one more element
-                    // is removed from the front, so that as long as an element
-                    // is remaining, `self.start_candidates` is always greater
-                    // than 0.
-                    // the order of the remaining elements is not meaningfully
-                    // impacted because we only ever swap with elements after
-                    // `i`, and `i` is the greatest index we will touch.
-                    unsafe {
-                        self.lexemes.swap_unchecked(i, self.start_candidates - 1);
-                        self.start_candidates = self.start_candidates.saturating_sub(1);
-                    }
-                }
-                // for removes, swap the last candidate with the last remainign
-                // token, then swap the remove with the last candidate, then dec
-                // `end_candidates` and `start_candidates`
-                FilterResult::Remove => {
-                    unsafe {
-                        // in the case that `start_candidates` ==
-                        // `end_candidates`, no swap happens and that's fine.
-                        // remove this: v
-                        //           [a,b,c][d,e,f][g,h,i]
-                        // swap these:    ^      ^
-                        //           [a,b,f][d,e,c][g,h,i]
-                        // swap these:  ^        ^
-                        //           [a,c,f][d,e,b][g,h,i]
-                        // decrement both counters:
-                        //           [a,c][f,d,e][b,g,h,i]
-                        self.lexemes
-                            .swap_unchecked(self.start_candidates - 1, self.end_candidates - 1);
-                        self.lexemes.swap_unchecked(i, self.end_candidates - 1);
-                        self.start_candidates = self.start_candidates.saturating_sub(1);
-                        self.end_candidates = self.end_candidates.saturating_sub(1);
-                    }
-                }
-            }
-        });
-
-        // replace `filtered`
-        self.filtered = filtered;
-    }
-}
-
-/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
-/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
-/// integral types, constants or identifiers are not parsed by this.
-pub struct LexemeParser {
-    lexemes: LexemeList,
-    len: usize,
-}
-
-impl LexemeParser {
-    pub fn new() -> Self {
-        Self {
-            lexemes: LexemeList::new(),
-            len: 0,
-        }
-    }
-
-    pub fn parse(&mut self, mut tokens: impl Iterator<Item = char>) -> Option<Token> {
-        self.lexemes.clear();
-        loop {
-            let Some(ch) = tokens.next() else {
-                break;
-            };
-
-            if crate::is_things::is_whitespace(ch) {
-                break;
-            }
-
-            self.lexemes.step(ch, self.len);
-            if self.lexemes.remaining().is_empty() {
-                break;
-            }
-        }
-        self.lexemes.candidates().last().copied()
-    }
-}
-
-use itertools::Itertools;
 use trie::Tree;

+pub struct TokenItem<'a> {
+    pub token: Token,
+    pub lexeme: &'a str,
+    pub offset: u32,
+}
+
 #[derive(Debug, Clone, Copy)]
-struct CountingIterator<I: Iterator> {
+struct CharCountingIterator<I: Iterator> {
    iter: I,
    count: usize,
 }

-impl<I: Iterator> From<I> for CountingIterator<I> {
+impl<I: Iterator> From<I> for CharCountingIterator<I> {
    fn from(iter: I) -> Self {
        Self { iter, count: 0 }
    }
 }

-impl<I: Iterator<Item = char>> Iterator for CountingIterator<I> {
+impl<I: Iterator<Item = char>> Iterator for CharCountingIterator<I> {
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
@@ -477,13 +327,13 @@ impl<I: Iterator<Item = char>> Iterator for CountingIterator<I> {
    }
 }

-impl<I: Iterator> CountingIterator<I> {
+impl<I: Iterator> CharCountingIterator<I> {
    pub(crate) fn offset(&self) -> usize {
        self.count
    }
 }

-impl<I: Iterator> core::ops::Deref for CountingIterator<I> {
+impl<I: Iterator> core::ops::Deref for CharCountingIterator<I> {
    type Target = I;

    fn deref(&self) -> &Self::Target {
@@ -491,13 +341,13 @@ impl<I: Iterator> core::ops::Deref for CountingIterator<I> {
    }
 }

-impl<I: Iterator> core::ops::DerefMut for CountingIterator<I> {
+impl<I: Iterator> core::ops::DerefMut for CharCountingIterator<I> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.iter
    }
 }

-type Source<'a> = CountingIterator<core::iter::Peekable<core::str::Chars<'a>>>;
+type Source<'a> = CharCountingIterator<core::iter::Peekable<core::str::Chars<'a>>>;

 pub struct TokenIterator<'a> {
    trie: Tree<char, Token>,
@@ -521,11 +371,11 @@ impl<'a> TokenIterator<'a> {
    }

    fn peekable_source(&self) -> Source<'a> {
-        CountingIterator::from(self.source[self.offset..].chars().peekable())
+        CharCountingIterator::from(self.source[self.offset..].chars().peekable())
    }

    fn parse(&mut self) -> Option<Token> {
-        let mut iter = CountingIterator::from(self.source[self.offset..].chars());
+        let mut iter = CharCountingIterator::from(self.source[self.offset..].chars());

        match self.trie.get_closest(&mut iter) {
            Some(token) => {
@@ -565,12 +415,8 @@ impl<'a> TokenIterator<'a> {
        }
        count
    }
-}

-impl<'a> Iterator for TokenIterator<'a> {
-    type Item = (Token, &'a str);
-
-    fn next(&mut self) -> Option<Self::Item> {
+    fn next_token(&mut self) -> Option<(Token, Range<usize>)> {
        // skip whitespace
        self.skip_whitespaces();

@@ -583,36 +429,49 @@ impl<'a> Iterator for TokenIterator<'a> {
                let token = complex_tokens::parse_constant(&mut source).ok()?;
                self.offset += source.offset();

-                Some((token, &self.source[start..self.offset]))
+                Some(token)
            }
            Some('.') if cursor.next().map_or(false, is_things::is_digit) => {
                let token = complex_tokens::parse_constant(&mut source).ok()?;
                self.offset += source.offset();

-                Some((token, &self.source[start..self.offset]))
+                Some(token)
            }
            Some('\'' | '"') => {
                let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?;
                self.offset += source.offset();

-                Some((token, &self.source[start..self.offset]))
+                Some(token)
            }
+            Some('`') => {
+                // raw identifier
+                self.skip(1);
+                self.skip_while(|c| is_things::is_id_continue(c));
+                if self.peekable_source().next() == Some('`') {
+                    self.skip(1);
+                    Some(Token::Ident)
+                } else {
+                    // unterminated raw identifier
+                    Some(Token::ParseError)
+                }
+            }
+            // `//`-style comments or doc-comments
            _ => match self.parse().map(|tok| match tok {
                Token::SlashSlash => {
                    self.skip_while(|c| c == '\n');
-                    (Token::Comment)
+                    Token::Comment
                }
                Token::SlashSlashSlash => {
                    self.skip_while(|c| c == '\n');
-                    (Token::DocComment)
+                    Token::DocComment
                }
                _ => tok,
            }) {
                Some(tok) => {
                    if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 {
-                        Some((Token::Ident, &self.source[start..self.offset]))
+                        Some(Token::Ident)
                    } else {
-                        Some((tok, &self.source[start..self.offset]))
+                        Some(tok)
                    }
                }
                None => {
@@ -623,15 +482,49 @@ impl<'a> Iterator for TokenIterator<'a> {
                    {
                        self.skip(1);
                        self.skip_while(|c| is_things::is_id_continue(c));
-                        Some((Token::Ident, &self.source[start..self.offset]))
+                        Some(Token::Ident)
                    } else {
                        None
                    }
                }
            },
-        };
+        }?;

-        token
+        Some((token, start..self.offset))
+    }
+
+    fn next_token_item(&mut self) -> Option<TokenItem<'a>> {
+        let (token, range) = self.next_token()?;
+        let lexeme = &self.source[range.clone()];
+        Some(TokenItem {
+            token,
+            lexeme,
+            offset: range.start as u32,
+        })
+    }
+
+    pub fn into_token_items(self) -> TokenItemIterator<'a> {
+        TokenItemIterator { inner: self }
+    }
+}
+
+impl<'a> Iterator for TokenIterator<'a> {
+    type Item = Token;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_token().map(|(token, _)| token)
+    }
+}
+
+pub struct TokenItemIterator<'a> {
+    inner: TokenIterator<'a>,
+}
+
+impl<'a> Iterator for TokenItemIterator<'a> {
+    type Item = TokenItem<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.inner.next_token_item()
    }
 }

@@ -643,44 +536,69 @@ mod tests {

    #[test]
    fn test_iterator() {
-        let tokens = "fn let void+++(++bool)";
+        let tokens = "fn let void+(+bool)";
        let mut lexer = TokenIterator::new(&tokens);
-        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
-        assert_eq!(lexer.next(), Some((Token::Let, "let")));
-        assert_eq!(lexer.next(), Some((Token::Void, "void")));
-        assert_eq!(lexer.next(), Some((Token::PlusPlus, "++")));
-        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
-        assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
-        assert_eq!(lexer.next(), Some((Token::PlusPlus, "++")));
-        assert_eq!(lexer.next(), Some((Token::Bool, "bool")));
-        assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
+        assert_eq!(lexer.next(), Some(Token::Fn));
+        assert_eq!(lexer.next(), Some(Token::Let));
+        assert_eq!(lexer.next(), Some(Token::Void));
+        assert_eq!(lexer.next(), Some(Token::Plus));
+        assert_eq!(lexer.next(), Some(Token::OpenParens));
+        assert_eq!(lexer.next(), Some(Token::Plus));
+        assert_eq!(lexer.next(), Some(Token::Bool));
+        assert_eq!(lexer.next(), Some(Token::CloseParens));
        assert_eq!(lexer.next(), None);
    }

+    #[test]
+    fn idents() {
+        let mut lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123");
+        assert!(lexer.all(|tok| tok == Token::Ident));
+    }
+
+    #[test]
+    fn ident_minus_ambiguity() {
+        let lexer = TokenIterator::new("a-a a- - a -a --a");
+        let tokens = lexer.collect::<Vec<_>>();
+        assert_eq!(
+            tokens,
+            vec![
+                Token::Ident,
+                Token::Ident,
+                Token::Minus,
+                Token::Ident,
+                Token::Ident,
+                Token::Ident
+            ]
+        );
+    }
+
    #[test]
    fn complex_iterator() {
        let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }";
-        let mut lexer = TokenIterator::new(&tokens);
-        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "my-function")));
-        assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "x")));
-        assert_eq!(lexer.next(), Some((Token::Colon, ":")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "i32")));
-        assert_eq!(lexer.next(), Some((Token::Comma, ",")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "y")));
-        assert_eq!(lexer.next(), Some((Token::Colon, ":")));
-        assert_eq!(lexer.next(), Some((Token::F32, "f32")));
-        assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
-        assert_eq!(lexer.next(), Some((Token::MinusGreater, "->")));
-        assert_eq!(lexer.next(), Some((Token::F32, "f32")));
-        assert_eq!(lexer.next(), Some((Token::OpenBrace, "{")));
-        assert_eq!(lexer.next(), Some((Token::Return, "return")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "x")));
-        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "y")));
-        assert_eq!(lexer.next(), Some((Token::Semi, ";")));
-        assert_eq!(lexer.next(), Some((Token::CloseBrace, "}")));
-        assert_eq!(lexer.next(), None);
+        let lexer = TokenIterator::new(&tokens);
+        let mut items = lexer
+            .into_token_items()
+            .map(|item| (item.token, item.lexeme));
+        assert_eq!(items.next(), Some((Token::Fn, "fn")));
+        assert_eq!(items.next(), Some((Token::Ident, "my-function")));
+        assert_eq!(items.next(), Some((Token::OpenParens, "(")));
+        assert_eq!(items.next(), Some((Token::Ident, "x")));
+        assert_eq!(items.next(), Some((Token::Colon, ":")));
+        assert_eq!(items.next(), Some((Token::I32, "i32")));
+        assert_eq!(items.next(), Some((Token::Comma, ",")));
+        assert_eq!(items.next(), Some((Token::Ident, "y")));
+        assert_eq!(items.next(), Some((Token::Colon, ":")));
+        assert_eq!(items.next(), Some((Token::F32, "f32")));
+        assert_eq!(items.next(), Some((Token::CloseParens, ")")));
+        assert_eq!(items.next(), Some((Token::MinusGreater, "->")));
+        assert_eq!(items.next(), Some((Token::F32, "f32")));
+        assert_eq!(items.next(), Some((Token::OpenBrace, "{")));
+        assert_eq!(items.next(), Some((Token::Return, "return")));
+        assert_eq!(items.next(), Some((Token::Ident, "x")));
+        assert_eq!(items.next(), Some((Token::Plus, "+")));
+        assert_eq!(items.next(), Some((Token::Ident, "y")));
+        assert_eq!(items.next(), Some((Token::Semi, ";")));
+        assert_eq!(items.next(), Some((Token::CloseBrace, "}")));
+        assert_eq!(items.next(), None);
    }
 }
Author	SHA1	Message	Date
janis	2790bc561f	more refactoring	2025-09-27 18:20:52 +02:00
janis	6e0fed0962	raw identifiers, comment out -- and ++ tokens	2025-09-27 17:07:50 +02:00
janis	122f8ff7f1	compiler warnings	2025-09-26 14:58:39 +02:00
janis	2e6b8b0cc3	add integer types	2025-09-26 14:52:53 +02:00