initial commit

2024-08-06 20:57:19 +02:00 · 2024-08-06 20:57:19 +02:00 · e8934b8ccc
commit e8934b8ccc
8 changed files with 951 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 /target
 /Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,10 @@
 [package]
 name = "compiler"
 version = "0.1.0"
 edition = "2021"
 [dependencies]
 itertools = "0.13.0"
 log = "0.4.22"
 thiserror = "1.0.63"
 unicode-xid = "0.2.4"
--- a/grammar.bnf
+++ b/grammar.bnf
@ -0,0 +1,95 @@
 # cool language called sea:
 <id-start> ::= ...
 <id-cont> ::= ...
 <digit> ::= ...
 <digits> ::= ...
 <letter> ::= ...
 <letter_> ::= <letter> | '_'
 <digitletter_> ::= <digit> | <letter> | '_'
 <ident> ::= <id-start>
        | <ident> <id-cont>
 <program> ::= <definition>
 <definition> ::= <var-decl> | <fn-decl>
 <fn-decl> ::= fn <ident> '(' <parameter-list> ,? ')' (-> <type-name>)? <block>
 <parameter-list> ::= <parameter>
                 | <parameter-list> , <parameter>
 <parameter> ::= <ident> : <type-name>
 <block> ::= <statement>*
        | <block> <expr>
 <statement> ::= <return-statement>
            | <expr-statement>
            | <assignment-statement>
            | <var-decl> ';'
 <return-statement> ::= return <expr>? ';'
 <expr-statement> ::= <expr> ';'
 <assignment-statement> ::= <ident> <assignment-op> <expr> ';'
 <assignment-op> ::= |= | &= | ^= | /= | *= | %= | <<= | >>= | += | -= | =
 <expr> ::= <or-expr>
 <or-expr> ::= <and-expr>
           | <or-expr> || <and-expr>
 <and-expr> ::= <bitor-expr>
           | <and-expr> && <bitor-expr>
 <bitor-expr> ::= <bitxor-expr>
           | <bitor-expr> '|' <bitxor-expr>
 <bitxor-expr> ::= <bitand-expr>
           | <bitxor-expr> ^ <bitand-expr>
 <bitand-expr> ::= <equality-expr>
           | <bitand-expr> & <equality-expr>
 <equality-expr> ::= <relational-expr>
           | <equality-expr> (!= | ==) <relational-expr>
 <relational-expr> ::= <shift-expr>
           | <relational-expr> (< | > | <= | >=) <shift-expr>
 <shift-expr> ::= <add-expr>
           | <shift-expr> (<< | >>) <add-expr>
 <add-expr> ::= <mul-expr>
           | <add-expr> (+ | -) <mul-expr>
 <mul-expr> ::= <prefix-expr>
           | <mul-expr> (* | / | %) <prefix-expr>
 <prefix-expr> ::= <prefix-op> <as-expr>
 <prefix-op> ::= '!' | '-' | '+' | '&' | '*'
 <as-expr> ::= <primary-expr> as <type-name>
 <primary-expr> ::= <constant>
               | <literal>
               | <ident>
               | '(' <expr> ')'
 <var-decl> ::= (let | var) <ident> (':' <type-name>)? ( = <expr> )?
 <type-name> ::= <ident>
            | <primitive-type>
            | <pointer>
 <pointer> ::= '*' 'const'? <type-name>
 <primitive-type> ::= bool
                 | <integral-type>
                 | <floating-type>
                 | void
 <integral-type> ::= ('u' | 'i') <digits>+
 <floating-type> ::= 'f'('32' | '64')
 <constant> ::= <integral-constant>
           | <floating-constant>
 <integral-constant> ::= <dec-digits><integral-type>?
                    | '0x' <hex-digits> <integral-type>?
                    | '0b' <bin-digits> <integral-type>?
                    | '0o' <oct-digits> <integral-type>?
 <floating-constant> ::= <dec-digits> <floating-type>?
                    | '.' <dec-digits> <exp-part>? <floating-type>?
                    | <dec-digits> '.' <dec-digits>? <exp-part>? <floating-type>?
 <exp-part> ::=  ('e' | 'E') ('-' | '+')? <dec-digits> 
--- a/1
+++ b/1
@ -0,0 +1 @@
 nightly
--- a/src/common.rs
+++ b/src/common.rs
@ -0,0 +1,160 @@
 #![allow(unused)]
 /// Returns `true` when `c` counts as whitespace according to the Rust
 /// language definition.
 ///
 /// The accepted set is `Pattern_White_Space`; see the
 /// [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
 /// for the definition of this class.
 pub fn is_whitespace(c: char) -> bool {
    // `Pattern_White_Space` is a stable set (it does not change between
    // Unicode versions), so hard-coding the code points is fine.
    match c {
        // Usual ASCII suspects: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}' => true,
        // ASCII space
        '\u{0020}' => true,
        // NEXT LINE from latin1
        '\u{0085}' => true,
        // Bidi markers: LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK
        '\u{200E}' | '\u{200F}' => true,
        // Dedicated Unicode whitespace: LINE SEPARATOR and PARAGRAPH SEPARATOR
        '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
 }
 /// Returns `true` when `c` may appear as the first character of an identifier.
 ///
 /// See the [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html)
 /// for a formal definition of valid identifier names.
 pub fn is_id_start(c: char) -> bool {
    match c {
        // `_` is formally not XID_Start, so it is admitted explicitly.
        '_' => true,
        // Everything else must be XID_Start.
        _ => unicode_xid::UnicodeXID::is_xid_start(c),
    }
 }
 /// True if `c` is valid as a non-first character of an identifier.
 ///
 /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
 /// a formal definition of valid identifier name.
 pub fn is_id_continue(c: char) -> bool {
    // Delegates to the `unicode_xid` crate's XID_Continue test.
    unicode_xid::UnicodeXID::is_xid_continue(c)
 }
 /// Returns `true` when `string` is lexically an identifier: it is non-empty,
 /// its first char satisfies [`is_id_start`] and every following char
 /// satisfies [`is_id_continue`].
 pub fn is_ident(string: &str) -> bool {
    let mut rest = string.chars();
    match rest.next() {
        // The empty string is never an identifier.
        None => false,
        Some(first) => is_id_start(first) && rest.all(is_id_continue),
    }
 }
 /// Returns `true` for the ASCII decimal digits `0` through `9`.
 pub fn is_digit(ch: char) -> bool {
    ch.is_ascii_digit()
 }
 /// Returns `true` for the binary digits `0` and `1`.
 pub fn is_bin_digit(ch: char) -> bool {
    matches!(ch, '0' | '1')
 }
 /// Returns `true` for the ASCII decimal digits `1` through `9` (not `0`).
 pub fn is_nonzero_digit(ch: char) -> bool {
    matches!(ch, '1'..='9')
 }
 /// Returns `true` for the octal digits `0` through `7`.
 pub fn is_oct_digit(ch: char) -> bool {
    matches!(ch, '0'..='7')
 }
 /// Returns `true` for the ASCII hexadecimal digits `0`-`9`, `a`-`f` and `A`-`F`.
 pub fn is_hex_digit(ch: char) -> bool {
    ch.is_ascii_hexdigit()
 }
 /// Extension trait for cloneable iterators: take the next item only when it
 /// passes a test, otherwise leave the iterator where it was.
 ///
 /// Rolling back is done by cloning the iterator up front and assigning the
 /// clone back on rejection.
 pub trait NextIf<I>: Iterator<Item = I> + Clone {
    /// Yields the next item if `pred` returns `true` for it.
    ///
    /// If `pred` returns `false` the iterator is restored to its state from
    /// before the call and `None` is returned. `None` is also returned when
    /// the iterator has no next item.
    #[must_use]
    fn next_if<F>(&mut self, pred: F) -> Option<I>
    where
        F: FnOnce(&Self::Item) -> bool,
    {
        let snapshot = self.clone();
        let candidate = self.next()?;
        if pred(&candidate) {
            return Some(candidate);
        }
        // Rejected: roll back to the state from before `next()`.
        *self = snapshot;
        None
    }
    /// Yields `pred(item)` for the next item if that is `Some(T)`.
    ///
    /// If `pred` returns `None` the iterator is restored to its state from
    /// before the call and `None` is returned. `None` is also returned when
    /// the iterator has no next item.
    #[must_use]
    fn next_if_map<F, T>(&mut self, pred: F) -> Option<T>
    where
        F: FnOnce(Self::Item) -> Option<T>,
    {
        let snapshot = self.clone();
        let item = self.next()?;
        let mapped = pred(item);
        if mapped.is_none() {
            // Rejected: roll back to the state from before `next()`.
            *self = snapshot;
        }
        mapped
    }
 }
 impl<I, T> NextIf<I> for T where T: Iterator<Item = I> + Clone {}
 /// Extension trait for cloneable iterators: run a parsing closure and undo
 /// everything it consumed when the closure reports failure.
 pub trait FallibleParse<I>: Iterator<Item = I> + Clone {
    /// Consumes items from `self` if and only if `map` yields `Some`.
    ///
    /// When `map` yields `None`, `self` is reset to its state from before the
    /// call and `None` is returned.
    #[must_use]
    fn try_parse<F, U>(&mut self, map: F) -> Option<U>
    where
        F: FnOnce(&mut Self) -> Option<U>,
    {
        // Keep a copy of the untouched iterator for a potential rollback.
        let checkpoint = self.clone();
        let parsed = map(self);
        if parsed.is_none() {
            *self = checkpoint;
        }
        parsed
    }
    /// Consumes items from `self` if and only if `map` yields `Ok`.
    ///
    /// When `map` yields `Err`, `self` is reset to its state from before the
    /// call and the error is passed through unchanged.
    #[must_use]
    fn try_parse_result<F, U, E>(&mut self, map: F) -> Result<U, E>
    where
        F: FnOnce(&mut Self) -> Result<U, E>,
    {
        // Keep a copy of the untouched iterator for a potential rollback.
        let checkpoint = self.clone();
        let outcome = map(self);
        if outcome.is_err() {
            *self = checkpoint;
        }
        outcome
    }
 }
 impl<I, T> FallibleParse<I> for T where T: Iterator<Item = I> + Clone {}
--- a/src/lexer.rs
+++ b/src/lexer.rs
@ -0,0 +1,442 @@
 use crate::tokens::Token;
 use crate::tokens::TokenPos;
 use itertools::Itertools;
 use crate::common::FallibleParse;
 use crate::common::NextIf;
 #[derive(Debug, thiserror::Error)]
 pub enum LexerError {
    #[error("{0}")]
    StringError(String),
    #[error("Exp part of floating constant had no digits.")]
    FloatingConstantExpPartNoDigit,
    #[error("Dummy Message.")]
    NumericalConstantDigitLeadingUnderscore,
    #[error("Dummy Message.")]
    NumericalConstantDigitNoDigit,
    #[error("Dummy Message.")]
    IntegralTypeExpectedDigit,
    #[error("Dummy Message.")]
    FloatingConstantInvalidTrailingType,
    #[error("Dummy Message.")]
    InvalidToken,
    #[error("Dummy Message.")]
    ExpectedIdStartForIdentifier,
 }
 /// Result type used by the lexer; the error is always a [`LexerError`].
 pub type LexerResult<T> = core::result::Result<T, LexerError>;
 /// Cursor over the UTF-8 bytes of a source text that yields `char`s and keeps
 /// track of the current byte offset.
 ///
 /// Invariant relied upon by the `unsafe` code below: `bytes` holds valid
 /// UTF-8 and `offset` sits on a char boundary. The methods here preserve it,
 /// because `offset` is only ever advanced by the UTF-8 length of a whole
 /// char; whoever constructs a `Chars` has to establish it.
 #[derive(Debug, Clone)]
 pub struct Chars<'a> {
    /// The complete source text as bytes.
    bytes: &'a [u8],
    /// Byte offset of the next char to be yielded.
    offset: usize,
 }
 impl<'a> Chars<'a> {
    /// Returns the part of the source that has not been consumed yet.
    pub fn as_str(&self) -> &str {
        // SAFETY: `bytes` is valid UTF-8 and `offset` is on a char boundary
        // (type invariant), so the tail starting at `offset` is valid UTF-8.
        unsafe { core::str::from_utf8_unchecked(&self.bytes[self.offset..]) }
    }
    /// Returns `true` once every byte of the source has been consumed.
    pub fn is_eof(&self) -> bool {
        self.offset >= self.bytes.len()
    }
    /// Returns the next char without consuming it.
    pub fn peek(&self) -> Option<char> {
        self.clone().next()
    }
    /// Returns the current byte offset as `u32`.
    ///
    /// NOTE: the conversion truncates for offsets above `u32::MAX`.
    pub fn position(&self) -> u32 {
        self.offset() as u32
    }
    /// Returns the current byte offset into the source.
    pub fn offset(&self) -> usize {
        self.offset
    }
    /// Returns the source text between the byte offsets `start` and `end`.
    ///
    /// # Panics
    ///
    /// Panics if the range is out of bounds, if `start > end`, or if either
    /// offset does not lie on a char boundary. Previously a range that split
    /// a multi-byte char produced an invalid `&str`, which is undefined
    /// behaviour.
    pub fn get_range(&self, start: u32, end: u32) -> &str {
        // SAFETY: `bytes` is valid UTF-8 as a whole (type invariant).
        let whole = unsafe { core::str::from_utf8_unchecked(self.bytes) };
        // Slicing the `str` (rather than the bytes) validates the boundaries.
        &whole[start as usize..end as usize]
    }
    /// Decodes the char at `offset` and advances past it.
    fn next_char(&mut self) -> Option<char> {
        let ch = self.as_str().chars().next()?;
        // Advance by whole chars only, which keeps `offset` on a boundary.
        self.offset += ch.len_utf8();
        Some(ch)
    }
 }
 impl<'a> Iterator for Chars<'a> {
    type Item = char;
    /// Yields the next char of the source, or `None` at the end of input.
    fn next(&mut self) -> Option<Self::Item> {
        self.next_char()
    }
 }
 /// Lexer state: the source text still to be lexed plus the tokens produced
 /// so far.
 #[derive(Debug, Clone)]
 pub struct Tokenizer<'a> {
    /// Cursor over the source text; advanced as tokens are recognised.
    source: Chars<'a>,
    /// Tokens emitted so far, each with its `start`/`end` position.
    tokens: Vec<TokenPos>,
 }
 /// Evaluates to the next char of `$expr`, or makes the enclosing function
 /// return `Ok(Token::Eof)` when there is none.
 ///
 /// The plain form consumes the char via `.next()`; the `?`-prefixed form only
 /// looks at it via `.peek()`.
 macro_rules! next_or_eof {
    // consuming form: `next_or_eof!(iter)`
    ($expr:expr) => {
        match $expr.next() {
            Some(c) => c,
            None => {
                return Ok(Token::Eof);
            }
        }
    };
    // peeking form: `next_or_eof!(?iter)`
    (?$expr:expr) => {
        match $expr.peek() {
            Some(c) => c,
            None => {
                return Ok(Token::Eof);
            }
        }
    };
 }
 /// Early-return helpers for expressions of type `Result<T, E>` or
 /// `Result<Option<T>, E>`. Every form returns `Err(e)` from the enclosing
 /// function when the expression is `Err(e)`; the forms differ in which
 /// success case short-circuits.
 macro_rules! residual {
    // `ok:` evaluates to the `Ok` payload.
    (ok: $expr:expr) => {
        match $expr {
            Ok(t) => t,
            Err(e) => {
                return Err(e);
            }
        }
    };
    // `none:` returns `Ok(Some(t))` from the enclosing function as soon as a
    // value is present; otherwise evaluates to the remaining `Ok` payload
    // (i.e. `None`).
    (none: $expr:expr) => {
        match $expr {
            Ok(Some(t)) => {
                return Ok(Some(t));
            }
            Ok(val) => val,
            Err(e) => {
                return Err(e);
            }
        }
    };
    // `flatten: none:` is like `none:` but returns the bare value as `Ok(t)`.
    (flatten: none: $expr:expr) => {
        match $expr {
            Ok(Some(t)) => {
                return Ok(t);
            }
            Ok(val) => val,
            Err(e) => {
                return Err(e);
            }
        }
    };
    // `some:` evaluates to the inner value and returns `Ok(None)` from the
    // enclosing function when there is none.
    (some: $expr:expr) => {
        match $expr {
            Ok(Some(t)) => t,
            Ok(None) => {
                return Ok(None);
            }
            Err(e) => {
                return Err(e);
            }
        }
    };
 }
 impl<'a> Tokenizer<'a> {
    /// Appends `token` together with its `start`/`end` byte offsets to the
    /// token list. Always returns `Ok(())`.
    fn push_token(&mut self, token: Token, start: u32, end: u32) -> LexerResult<()> {
        self.tokens.push(TokenPos::new(token, start, end));
        Ok(())
    }
    /// Lexes the next token from the source and appends it to the token list.
    ///
    /// Leading whitespace is skipped first. Then, in this order, the input is
    /// tried as
    ///
    /// 1. an integral type (`u`/`i` followed by digits, and not continued by
    ///    further identifier chars),
    /// 2. a numerical constant (starts with a digit, or with `.` followed by
    ///    a digit),
    /// 3. a token with a fixed lexeme (punctuation, operators, keywords,
    ///    comment openers),
    /// 4. an identifier.
    ///
    /// A line comment opener is pushed as its own token, followed by a
    /// `Comment`/`DocComment` token spanning the text up to (not including)
    /// the newline or the end of input. For a block comment only the
    /// `Comment`/`DocComment` token is pushed; it spans the text between the
    /// opener and the closing `*/`, or up to the end of input if the comment
    /// is unterminated.
    ///
    /// # Errors
    ///
    /// Returns the error of the constant parser for malformed constants, and
    /// [`LexerError::ExpectedIdStartForIdentifier`] when nothing matched and
    /// the next char cannot start an identifier (this includes the end of
    /// input).
    pub fn next_token(&mut self) -> LexerResult<()> {
        // Skip leading whitespace; the number of skipped chars is irrelevant.
        let _skipped = self
            .source
            .take_while_ref(|&c| crate::common::is_whitespace(c))
            .count();
        let start = self.source.position();

        let token = self
            .source
            .try_parse_result(|source| -> LexerResult<Option<Token>> {
                // An integral type only counts if it is not the prefix of a
                // longer identifier (`u8x`), and a bare `u`/`i` without digits
                // is not an error here: both cases are left to the identifier
                // and keyword paths below. Previously every identifier that
                // starts with `u` or `i` aborted lexing with an error.
                let integral = source.try_parse(|probe| match try_parse_integral_type(probe) {
                    Ok(Some(()))
                        if !probe.peek().is_some_and(crate::common::is_id_continue) =>
                    {
                        Some(Token::IntegralType)
                    }
                    _ => None,
                });
                if integral.is_some() {
                    return Ok(integral);
                }

                // A constant starts with a digit, or with `.` followed by a
                // digit. Any other `.` is an ordinary lexeme.
                let mut peeking = source.clone();
                let is_constant = match peeking.next() {
                    Some('0'..='9') => true,
                    Some('.') => peeking.next().is_some_and(crate::common::is_digit),
                    _ => false,
                };
                if is_constant {
                    Ok(Some(parse_constant(source)?))
                } else {
                    Ok(None)
                }
            });
        if let Some(token) = token? {
            return self.push_token(token, start, self.source.position());
        }

        // lexical tokens
        let token = crate::tokens::LexemeParser::parse(self.source.clone());
        if let Some(token) = token {
            _ = self.source.advance_by(token.lexeme_len());
            match token {
                Token::SlashSlash | Token::SlashSlashSlash => {
                    _ = self.push_token(token, start, self.source.position());
                    let start = self.source.position();
                    // `end` trails the cursor by exactly the newline, so it is
                    // also correct when the input ends without a newline.
                    let mut end = start;
                    while let Some(ch) = self.source.next() {
                        if ch == '\n' {
                            break;
                        }
                        end = self.source.position();
                    }
                    return self.push_token(
                        if token == Token::SlashSlash {
                            Token::Comment
                        } else {
                            Token::DocComment
                        },
                        start,
                        end,
                    );
                }
                Token::SlashStar | Token::SlashStarStar => {
                    let start = self.source.position();
                    // Offset of the `*` consumed right before the current
                    // char, if the previous char was a `*`.
                    let mut pending_star: Option<u32> = None;
                    let end = loop {
                        let before = self.source.position();
                        let Some(ch) = self.source.next() else {
                            // unterminated comment: it runs to the end of input
                            break before;
                        };
                        if let (Some(star), '/') = (pending_star, ch) {
                            // closing `*/`: the comment text ends before the `*`
                            break star;
                        }
                        pending_star = (ch == '*').then_some(before);
                    };
                    return self.push_token(
                        if token == Token::SlashStar {
                            Token::Comment
                        } else {
                            Token::DocComment
                        },
                        start,
                        end,
                    );
                }
                _ => {}
            }
            // A keyword that is directly continued by identifier chars is
            // just the beginning of a longer identifier.
            if token.maybe_ident()
                && self
                    .source
                    .take_while_ref(|&c| crate::common::is_id_continue(c))
                    .count()
                    > 0
            {
                return self.push_token(Token::Ident, start, self.source.position());
            }
            return self.push_token(token, start, self.source.position());
        }

        // Nothing else matched: this has to be an identifier.
        self.source
            .next_if(|&c| crate::common::is_id_start(c))
            .ok_or(LexerError::ExpectedIdStartForIdentifier)?;
        let _rest = self
            .source
            .take_while_ref(|&c| crate::common::is_id_continue(c))
            .count();
        self.push_token(Token::Ident, start, self.source.position())
    }
 }
 /// Parses an integral type name.
 ///
 /// IntegralType <-
 ///     ( 'u' | 'i' ) DIGITS+
 ///
 /// Returns `Ok(None)` without consuming anything when the input does not
 /// start with `u` or `i`. Returns
 /// `Err(LexerError::IntegralTypeExpectedDigit)` when the prefix is not
 /// followed by at least one digit; the prefix stays consumed in that case.
 fn try_parse_integral_type(source: &mut Chars) -> LexerResult<Option<()>> {
    let Some(_prefix) = source.next_if(|&c| matches!(c, 'u' | 'i')) else {
        return Ok(None);
    };
    let width_digits = source
        .take_while_ref(|&c| crate::common::is_digit(c))
        .count();
    if width_digits == 0 {
        Err(LexerError::IntegralTypeExpectedDigit)
    } else {
        Ok(Some(()))
    }
 }
 /// Numerical base of an integer constant.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum Radix {
    /// Base 16.
    Hex,
    /// Base 2.
    Bin,
    /// Base 10.
    Dec,
    /// Base 8.
    Oct,
 }
 impl Radix {
    /// Maps a radix letter to its radix: `o` is octal, `b` binary, `x`
    /// hexadecimal and `d` decimal. Letter case is ignored.
    ///
    /// # Safety
    ///
    /// Must be called with one of `['b','x','d','o']` (in either case). Any
    /// other char runs into `unreachable!()`.
    unsafe fn from_char_unchecked(c: char) -> Self {
        match Self::from_char(c) {
            Some(radix) => radix,
            None => unreachable!(),
        }
    }
    /// Maps a radix letter to its radix: `o` is octal, `b` binary, `x`
    /// hexadecimal and `d` decimal. Letter case is ignored.
    ///
    /// Returns `None` for every other char.
    fn from_char(c: char) -> Option<Self> {
        // Each letter has to select its own radix; mapping all of them to
        // `Oct` made hex, binary and decimal prefixes lex with octal digits.
        match c.to_ascii_lowercase() {
            'o' => Some(Self::Oct),
            'b' => Some(Self::Bin),
            'x' => Some(Self::Hex),
            'd' => Some(Self::Dec),
            _ => None,
        }
    }
    /// Returns the token kind of an integer constant written in this radix.
    fn to_token(self) -> Token {
        match self {
            Radix::Hex => Token::IntegerHexConstant,
            Radix::Bin => Token::IntegerBinConstant,
            Radix::Oct => Token::IntegerOctConstant,
            Radix::Dec => Token::IntegerConstant,
        }
    }
    /// Returns the predicate that decides whether a char is a digit in this
    /// radix.
    fn is_digit(self) -> fn(char) -> bool {
        match self {
            Radix::Hex => crate::common::is_hex_digit,
            Radix::Bin => crate::common::is_bin_digit,
            Radix::Oct => crate::common::is_oct_digit,
            Radix::Dec => crate::common::is_digit,
        }
    }
 }
 /// Consumes a run of digits, where DIGIT is defined by `radix`:
 ///
 /// DIGITS <-
 ///     if allow_leading_underscore: `_`* DIGIT (DIGIT|`_`)*
 ///     else: DIGIT (DIGIT|`_`)*
 ///
 /// Fails with `NumericalConstantDigitLeadingUnderscore` when an underscore
 /// sits where the mandatory first digit is expected, and with
 /// `NumericalConstantDigitNoDigit` when anything else (or nothing) does.
 fn parse_digit_part(
    source: &mut Chars,
    allow_leading_underscore: bool,
    radix: Radix,
 ) -> LexerResult<()> {
    let is_radix_digit = radix.is_digit();

    if allow_leading_underscore {
        let _skipped = source.take_while_ref(|&c| c == '_').count();
    }

    // The first digit is mandatory.
    if source.next_if(|&c| is_radix_digit(c)).is_none() {
        // Nothing was consumed, so the offending char can still be inspected.
        return Err(match source.peek() {
            Some('_') => LexerError::NumericalConstantDigitLeadingUnderscore,
            _ => LexerError::NumericalConstantDigitNoDigit,
        });
    }

    // Any mix of digits and underscores may follow.
    let _tail = source
        .take_while_ref(|&c| is_radix_digit(c) || c == '_')
        .count();
    Ok(())
 }
 /// Parses the exponent part of a floating constant, if there is one.
 ///
 /// returns `Err(E)` if an exponent marker was found but no digits follow it.
 /// returns `Ok(None)` if no exp part was found.
 /// returns `Ok(Some(()))` if an exp part was found and parsed.
 ///
 /// EXP_PART <-
 ///     (`e`|`E`) (`-`|`+`)? DEC_DIGITS
 fn try_parse_exp_part(source: &mut Chars) -> LexerResult<Option<()>> {
    // Without the `e`/`E` marker there is no exponent at all.
    if source.next_if(|&c| matches!(c, 'e' | 'E')).is_none() {
        return Ok(None);
    }

    // optional sign
    let _sign = source.next_if(|&c| matches!(c, '-' | '+'));

    let digit_count = source
        .take_while_ref(|&c| crate::common::is_digit(c))
        .count();
    if digit_count < 1 {
        // need digits following exp notation
        return Err(LexerError::FloatingConstantExpPartNoDigit);
    }
    Ok(Some(()))
 }
 /// CONSTANT <-
 ///    DEC_DIGITS IntegralType?
 ///    `0x` HEX_DIGITS IntegralType?
 ///    `0b` BIN_DIGITS IntegralType?
 ///    `0o` OCT_DIGITS IntegralType?
 ///    DEC_DIGITS FloatingType?
 ///    `.` DEC_DIGITS EXP_PART? FloatingType?
 ///    DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
 fn parse_constant(source: &mut Chars) -> LexerResult<Token> {
    let zero = source.next_if(|&c| c == '0').is_some();
    let radix = zero
        .then(|| source.next_if_map(|c| Radix::from_char(c)))
        .flatten();
    if let Some(radix) = radix {
        parse_digit_part(source, false, radix)?;
        if source.peek().map(|c| c == 'u' || c == 'i') == Some(true) {
            try_parse_integral_type(source)?;
        }
        return Ok(radix.to_token());
    }
    // if zero: `_`* DIGIT (DIGIT|`_`)*
    // else: DIGIT (DIGIT|`_`)*
    let _digits = parse_digit_part(source, false, Radix::Dec)?;
    if let Ok(_) = source.try_parse_result(|source| try_parse_integral_type(source)) {
        return Ok(Token::IntegerConstant);
    }
    let dot = source.next_if(|&c| c == '.').is_some();
    if dot {
        parse_digit_part(source, false, Radix::Dec)?;
    }
    // parse exp notation
    let exp = try_parse_exp_part(source)?.is_some();
    // trailing FloatingType?
    let floating = if source.next_if(|&c| c == 'f').is_some() {
        let digits = source.next_tuple::<(char, char)>();
        if !(digits == Some(('6', '4')) || digits == Some(('3', '2'))) {
            // need either f64 or f32 here!
            return Err(LexerError::FloatingConstantInvalidTrailingType);
        }
        true
    } else {
        false
    };
    let token = match (dot, exp, floating) {
        (false, false, false) => Token::IntegerConstant,
        (true, false, _) => Token::DotFloatingConstant,
        (true, true, _) => Token::DotFloatingExpConstant,
        (false, true, _) => Token::FloatingExpConstant,
        (false, _, _) => Token::FloatingConstant,
    };
    Ok(token)
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@ -0,0 +1,5 @@
 #![feature(extract_if, iter_advance_by)]
 mod common;
 mod lexer;
 mod tokens;
--- a/src/tokens.rs
+++ b/src/tokens.rs
@ -0,0 +1,236 @@
 /// Declares the token enum `$ty_name` from two lists: tokens without a fixed
 /// lexeme (`$name2`) and tokens with one (`$name => $lexeme`).
 ///
 /// Besides the enum itself this generates a `Display` impl and the helpers
 /// `lexeme`, `lexeme_len`, `maybe_ident` and `lexemes`.
 macro_rules! tokens {
    ($vis:vis $ty_name:ident:
        {
            $($name2:ident),*
        },
        {
            $($name:ident => $lexeme:literal),*
        }) => {
            // Variants with a fixed lexeme are declared first, followed by
            // the variants without one.
            #[allow(dead_code)]
            #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
            $vis enum $ty_name {
                $($name,
                )*
                $($name2,)*
            }
            // Tokens with a lexeme print as that lexeme, all others print as
            // `<VariantName>`.
            impl std::fmt::Display for $ty_name {
                fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                    match self {
                        $(Self::$name => write!(f, "{}", $lexeme),)*
                        $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),*
                    }
                }
            }
            #[allow(dead_code)]
            impl $ty_name {
                // The fixed lexeme of this token, or `None` if it has none.
                $vis fn lexeme(&self) -> Option<&'static str> {
                    match self {
                        $(Self::$name => Some($lexeme),)*
                        $(Self::$name2 => None),*
                    }
                }
                /// returns the number of chars in this lexeme
                $vis fn lexeme_len(&self) -> usize {
                    self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0)
                }
                // True if this token's lexeme is itself a lexically valid
                // identifier; false for tokens without a lexeme.
                $vis fn maybe_ident(&self) -> bool {
                    self.lexeme().map(|lexeme| crate::common::is_ident(lexeme)).unwrap_or(false)
                }
                // All tokens that have a fixed lexeme, paired with it.
                $vis fn lexemes() -> &'static [(Self, &'static str)] {
                    &[
                        $((Self::$name, $lexeme)),*
                    ]
                }
            }
        };
 }
 // Declares `Token`. The first list holds tokens without a fixed lexeme, the
 // second list holds tokens together with their lexeme.
 tokens!(pub Token: {
    Eof,
    // Marker Token for any Comment
    Comment,
    DocComment,
    // NOTE(review): the original comment here announced a marker token for
    // pre-processing directives, but no such token is declared.
    // Constants:
    CharConstant,
    IntegerConstant,
    IntegerHexConstant,
    IntegerBinConstant,
    IntegerOctConstant,
    FloatingConstant,
    FloatingExpConstant,
    DotFloatingConstant,
    DotFloatingExpConstant,
    StringConstant,
    // Integral type names and identifiers:
    IntegralType,
    Ident
 },
    // Lexical Tokens:
    {
        // Comment delimiters:
        SlashSlash => "//",
        SlashStar => "/*",
        SlashStarStar => "/**",
        StarSlash => "*/",
        SlashSlashSlash => "///",
        // Punctuation:
        OpenParens => "(",
        CloseParens => ")",
        OpenBrace => "{",
        CloseBrace => "}",
        OpenSquareBracket => "[",
        CloseSquareBracket => "]",
        Semi => ";",
        Comma => ",",
        Elipsis3 => "...",
        Elipsis2 => "..",
        Colon => ":",
        Equal => "=",
        // Keywords:
        Void => "void",
        Bool => "bool",
        F32 => "f32",
        F64 => "f64",
        Const => "const",
        Fn => "fn",
        Let => "let",
        Var => "var",
        If => "if",
        As => "as",
        Else => "else",
        Return => "return",
        // Operators
        Dot => ".",
        MinusGreater => "->",
        Bang => "!",
        Tilde => "~",
        Plus => "+",
        Minus => "-",
        Star => "*",
        Slash => "/",
        Percent => "%",
        Less => "<",
        Greater => ">",
        LessEqual => "<=",
        GreaterEqual => ">=",
        EqualEqual => "==",
        BangEqual => "!=",
        PipePipe => "||",
        AmpersandAmpersand => "&&",
        Ampersand => "&",
        Caret => "^",
        Pipe => "|",
        LessLess => "<<",
        GreaterGreater => ">>",
        Question => "?",
        // Compound assignment operators:
        PlusEqual => "+=",
        MinusEqual => "-=",
        StarEqual => "*=",
        SlashEqual => "/=",
        PercentEqual => "%=",
        AmpersandEqual => "&=",
        PipeEqual => "|=",
        CaretEqual => "^=",
        LessLessEqual => "<<=",
        GreaterGreaterEqual => ">>="
    });
 /// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
 /// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
 /// integral types, constants or identifiers are not parsed by this.
 pub struct LexemeParser {
    /// Tokens whose lexeme still matches all chars received so far and is
    /// longer than what has been received.
    lexemes: Vec<Token>,
    /// Tokens whose lexeme has been matched completely, in the order in which
    /// they completed.
    candidates: Vec<Token>,
    /// Number of chars received so far.
    len: usize,
 }
 impl LexemeParser {
    /// Creates a parser in which every token with a fixed lexeme is still a
    /// possible match.
    pub fn new() -> Self {
        Self {
            lexemes: Token::lexemes().iter().map(|&(tok, _)| tok).collect(),
            candidates: Vec::new(),
            len: 0,
        }
    }
    /// Ends parsing and returns the most recently completed match, if any.
    pub fn finish(mut self) -> Option<Token> {
        self.candidates.pop()
    }
    /// Returns the longest fixed-lexeme token at the start of `tokens`, or
    /// `None` if the input does not start with one.
    ///
    /// Matching stops at the first whitespace char, at the end of the input,
    /// or as soon as no longer lexeme can match any more.
    pub fn parse(tokens: impl Iterator<Item = char>) -> Option<Token> {
        let mut this = Self::new();
        for ch in tokens {
            if crate::common::is_whitespace(ch) {
                break;
            }
            // `advance` yields `None` while longer matches are still possible
            // and `Some(result)` once it is done. Using `?` on it had this
            // exactly backwards: it returned `None` while matching was still
            // in progress and threw the finished result away.
            if let Some(result) = this.advance(ch) {
                return result;
            }
        }
        this.finish()
    }
    /// accepts a char and returns `None` until it is done trying to parse the longest `Token`.
    /// when finished, returns a Token, if it parsed one, or `Some(None)`.
    pub fn advance(&mut self, ch: char) -> Option<Option<Token>> {
        self.len += 1;
        let len = self.len;
        let candidates = &mut self.candidates;

        // Keep the tokens whose lexeme matches the new char and is still
        // longer than the input seen so far. A token whose lexeme is now
        // matched completely moves over to the candidates.
        self.lexemes.retain(|tok| {
            let lexeme = tok
                .lexeme()
                .expect("only tokens with a lexeme are tracked")
                .as_bytes();
            // Every lexeme consists of single-byte (ASCII) chars, so bytes
            // can be compared with chars one to one.
            let matches = lexeme.get(len - 1).is_some_and(|&byte| char::from(byte) == ch);
            if !matches {
                return false;
            }
            if lexeme.len() <= len {
                candidates.push(*tok);
                return false;
            }
            true
        });

        // we prefer the longer match
        // that means that a+++++b doesn't parse and a+++(++b) is a++ + ++b
        // `&&i` is also LogicalAnd i and not Ampersand Ampersand i
        // Somehow, this is also a gnu extension...
        if self.lexemes.is_empty() {
            // return match, if it exists
            Some(self.candidates.pop())
        } else {
            None
        }
    }
 }
 /// A token together with the range of the source text it was lexed from.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct TokenPos {
    /// The kind of token.
    pub token: Token,
    /// Start of the token in the source.
    pub start: u32,
    /// End of the token in the source.
    pub end: u32,
 }
 impl TokenPos {
    /// Bundles `token` with its `start` and `end` position.
    pub fn new(token: Token, start: u32, end: u32) -> Self {
        Self { token, start, end }
    }
 }