//! Token definitions and longest-match lexeme parsing.
/// Generates a token enum plus `Display` and lexeme-lookup helpers.
///
/// Invocation shape:
/// `tokens!(vis Name: { Marker1, Marker2, ... }, { Named => "lexeme", ... })`
///
/// The first brace group lists marker tokens with no fixed spelling
/// (identifiers, constants, ...); the second maps tokens to their fixed
/// lexemes. The fixed-lexeme variants are emitted first in the enum.
macro_rules! tokens {
    ($vis:vis $ty_name:ident:
    {
        $($name2:ident),*
    },
    {
        $($name:ident => $lexeme:literal),*
    }) => {
        #[allow(dead_code)]
        #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
        $vis enum $ty_name {
            // Fixed-lexeme tokens first, then marker tokens.
            $($name,
            )*
            $($name2,)*
        }

        impl std::fmt::Display for $ty_name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                match self {
                    // Fixed-lexeme tokens print their lexeme verbatim...
                    $(Self::$name => write!(f, "{}", $lexeme),)*
                    // ...marker tokens print as their name in angle brackets,
                    // e.g. `<Ident>`.
                    $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),*
                }
            }
        }

        #[allow(dead_code)]
        impl $ty_name {
            /// The fixed lexeme of this token, or `None` for marker tokens.
            $vis fn lexeme(&self) -> Option<&'static str> {
                match self {
                    $(Self::$name => Some($lexeme),)*
                    $(Self::$name2 => None),*
                }
            }

            /// returns the number of chars in this lexeme
            /// (0 for marker tokens, which have no fixed lexeme)
            $vis fn lexeme_len(&self) -> usize {
                self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0)
            }

            /// Whether this token's lexeme is shaped like an identifier
            /// (true for keywords; delegates to `crate::common::is_ident`).
            $vis fn maybe_ident(&self) -> bool {
                self.lexeme().map(|lexeme| crate::common::is_ident(lexeme)).unwrap_or(false)
            }

            /// Every fixed-lexeme token paired with its lexeme, in
            /// declaration order.
            $vis fn lexemes() -> &'static [(Self, &'static str)] {
                &[
                    $((Self::$name, $lexeme)),*
                ]
            }
        }
    };
}
|
|
|
|
// Generate the `Token` enum and its helpers (see the `tokens!` macro above).
tokens!(pub Token: {
    // Marker tokens: no fixed spelling; the actual text comes from the
    // source span recorded by the lexer.
    Eof,
    ParseError,
    // Marker Token for any Comment
    Comment,
    DocComment,
    // Constant and identifier-like tokens whose spelling varies.
    // NOTE(review): a previous comment here said "pre-processing directive",
    // which does not match the tokens that follow — presumably stale.
    CharConstant,
    IntegerConstant,
    IntegerHexConstant,
    IntegerBinConstant,
    IntegerOctConstant,
    FloatingConstant,
    FloatingExpConstant,
    DotFloatingConstant,
    DotFloatingExpConstant,
    StringConstant,
    IntegralType,
    Ident
},
// Lexical Tokens:
{
    // Comment delimiters:
    SlashSlash => "//",
    SlashStar => "/*",
    SlashStarStar => "/**",
    StarSlash => "*/",
    SlashSlashSlash => "///",
    // Punctuation:
    OpenParens => "(",
    CloseParens => ")",
    OpenBrace => "{",
    CloseBrace => "}",
    OpenSquareBracket => "[",
    CloseSquareBracket => "]",
    Semi => ";",
    Comma => ",",
    // NOTE(review): "Elipsis" is a misspelling of "Ellipsis"; renaming would
    // touch all call sites, so it is only flagged here.
    Elipsis3 => "...",
    Elipsis2 => "..",
    Colon => ":",
    Equal => "=",
    // Keywords:
    Void => "void",
    Bool => "bool",
    F32 => "f32",
    F64 => "f64",
    Const => "const",
    Fn => "fn",
    Let => "let",
    Var => "var",
    If => "if",
    As => "as",
    Else => "else",
    Return => "return",
    // Operators
    Dot => ".",
    MinusGreater => "->",
    Bang => "!",
    Tilde => "~",
    Plus => "+",
    Minus => "-",
    Star => "*",
    Slash => "/",
    Percent => "%",
    Less => "<",
    Greater => ">",
    LessEqual => "<=",
    GreaterEqual => ">=",
    EqualEqual => "==",
    BangEqual => "!=",
    PipePipe => "||",
    AmpersandAmpersand => "&&",
    Ampersand => "&",
    Caret => "^",
    Pipe => "|",
    LessLess => "<<",
    GreaterGreater => ">>",
    Question => "?",
    // Compound assignment operators:
    PlusEqual => "+=",
    MinusEqual => "-=",
    StarEqual => "*=",
    SlashEqual => "/=",
    PercentEqual => "%=",
    AmpersandEqual => "&=",
    PipeEqual => "|=",
    CaretEqual => "^=",
    LessLessEqual => "<<=",
    GreaterGreaterEqual => ">>="
});
|
|
|
|
impl Token {
|
|
pub fn is_assignment_op(self) -> bool {
|
|
match self {
|
|
Token::PlusEqual
|
|
| Token::MinusEqual
|
|
| Token::StarEqual
|
|
| Token::SlashEqual
|
|
| Token::PercentEqual
|
|
| Token::PipeEqual
|
|
| Token::CaretEqual
|
|
| Token::AmpersandEqual
|
|
| Token::LessLessEqual
|
|
| Token::GreaterGreaterEqual
|
|
| Token::Equal => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
pub fn is_unary_op(self) -> bool {
|
|
match self {
|
|
Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
pub fn is_binary_op(self) -> bool {
|
|
match self {
|
|
Token::Star
|
|
| Token::Slash
|
|
| Token::Percent
|
|
| Token::Pipe
|
|
| Token::Ampersand
|
|
| Token::Caret
|
|
| Token::Plus
|
|
| Token::Minus
|
|
| Token::PipePipe
|
|
| Token::AmpersandAmpersand
|
|
| Token::BangEqual
|
|
| Token::EqualEqual
|
|
| Token::Less
|
|
| Token::Greater
|
|
| Token::LessEqual
|
|
| Token::GreaterEqual
|
|
| Token::LessLess
|
|
| Token::GreaterGreater => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
/// integral types, constants or identifiers are not parsed by this.
pub struct LexemeParser {
    // Tokens whose lexemes still match the input consumed so far.
    lexemes: Vec<Token>,
    // Fully matched tokens; later entries matched longer lexemes, so the
    // last element is the best (longest) match.
    candidates: Vec<Token>,
    // Number of chars consumed so far.
    len: usize,
}
|
|
|
|
impl LexemeParser {
|
|
pub fn new() -> Self {
|
|
let lexemes = Token::lexemes()
|
|
.iter()
|
|
.map(|(tok, _)| tok.clone())
|
|
.collect::<Vec<_>>();
|
|
|
|
Self {
|
|
lexemes,
|
|
candidates: vec![],
|
|
len: 0,
|
|
}
|
|
}
|
|
|
|
pub fn finish(mut self) -> Option<Token> {
|
|
self.candidates.pop()
|
|
}
|
|
|
|
pub fn parse(mut tokens: impl Iterator<Item = char>) -> Option<Token> {
|
|
let mut this = Self::new();
|
|
loop {
|
|
let Some(ch) = tokens.next() else {
|
|
break;
|
|
};
|
|
|
|
if crate::common::is_whitespace(ch) {
|
|
break;
|
|
}
|
|
|
|
match this.advance(ch)? {
|
|
None => {}
|
|
Some(token) => {
|
|
return Some(token);
|
|
}
|
|
}
|
|
}
|
|
this.finish()
|
|
}
|
|
|
|
/// Accepts a `char` and returns `Some(None)` until it is done trying to parse the longest lexeme.
|
|
/// If no more potential matches are available, returns the longest matched token as `Some(Token)`, or `None` on failure.
|
|
|
|
/// accepts a char and returns `None` until it is done trying to parse the longest `Token`.
|
|
/// when finished, returns a Token, if it parsed one, or `Some(None)`.
|
|
pub fn advance(&mut self, ch: char) -> Option<Option<Token>> {
|
|
self.len += 1;
|
|
|
|
// advance match
|
|
// keep tokens whose lexemes match the next char
|
|
self.lexemes.retain(|tok| {
|
|
// SAFETY: all of these tokens are lexical, and every character in
|
|
// them is represented by a single byte and we know they must be
|
|
// utf8/ascii.
|
|
unsafe {
|
|
char::from_u32_unchecked(tok.lexeme().unwrap().as_bytes()[self.len - 1] as u32)
|
|
== ch
|
|
}
|
|
});
|
|
|
|
// A token has been successfully matched completely if it has not yet
|
|
// been removed from the lexeme list but the length of it's lexeme is no
|
|
// greater than the number of chars we've received.
|
|
self.candidates.extend(self.lexemes.extract_if(|tok| {
|
|
// SAFETY: as above, all of the tokens in self.lexemes are
|
|
// lexical and are all single byte characters.
|
|
tok.lexeme().unwrap().as_bytes().len() <= self.len
|
|
}));
|
|
|
|
// we prefer the longer match
|
|
// that means that a+++++b doesn't parse and a+++(++b) is a++ + ++b
|
|
// `&&i` is also LogicalAnd i and not Ampersand Ampersand i
|
|
// Somehow, this is also a gnu extension...
|
|
|
|
if self.lexemes.is_empty() {
|
|
// return match, if it exists
|
|
return match self.candidates.pop() {
|
|
Some(token) => Some(Some(token)),
|
|
None => None,
|
|
};
|
|
}
|
|
|
|
return Some(None);
|
|
}
|
|
}
|
|
|
|
// A `Token` together with its position in the source.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenPos {
    pub token: Token,
    // Offset where the token starts.
    pub start: u32,
    // Offset where the token ends.
    // NOTE(review): whether `end` is inclusive or exclusive is not visible
    // here — confirm against the lexer that produces these.
    pub end: u32,
}
|
|
|
|
impl TokenPos {
|
|
pub fn new(token: Token, start: u32, end: u32) -> Self {
|
|
Self { token, start, end }
|
|
}
|
|
}
|