use std::collections::HashMap;

macro_rules! tokens {
    ($vis:vis $ty_name:ident: { $($name2:ident),* }, { $($name:ident => $lexeme:literal),* }) => {
        #[allow(dead_code)]
        #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
        $vis enum $ty_name {
            $($name,)*
            $($name2,)*
        }

        impl std::fmt::Display for $ty_name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                match self {
                    $(Self::$name => write!(f, "{}", $lexeme),)*
                    $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),*
                }
            }
        }

        #[allow(dead_code)]
        impl $ty_name {
            $vis fn lexeme(&self) -> Option<&'static str> {
                match self {
                    $(Self::$name => Some($lexeme),)*
                    $(Self::$name2 => None),*
                }
            }

            /// Returns the number of chars in this lexeme.
            $vis fn lexeme_len(&self) -> usize {
                self.lexeme().map(|lexeme| lexeme.chars().count()).unwrap_or(0)
            }

            $vis fn maybe_ident(&self) -> bool {
                self.lexeme().map(|lexeme| crate::common::is_ident(lexeme)).unwrap_or(false)
            }

            $vis fn lexemes() -> &'static [(Self, &'static str)] {
                &[$((Self::$name, $lexeme)),*]
            }
        }
    };
}

tokens!(pub Token: {
    Eof,
    ParseError,
    // Marker Token for any Comment
    Comment,
    DocComment,
    // Marker Token for any pre-processing directive
    CharConstant,
    IntegerConstant,
    IntegerHexConstant,
    IntegerBinConstant,
    IntegerOctConstant,
    FloatingConstant,
    FloatingExpConstant,
    DotFloatingConstant,
    DotFloatingExpConstant,
    StringConstant,
    IntegralType,
    Ident
},
// Lexical Tokens:
{
    SlashSlash => "//",
    SlashStar => "/*",
    SlashStarStar => "/**",
    StarSlash => "*/",
    SlashSlashSlash => "///",
    // Punctuation:
    OpenParens => "(",
    CloseParens => ")",
    OpenBrace => "{",
    CloseBrace => "}",
    OpenSquareBracket => "[",
    CloseSquareBracket => "]",
    Semi => ";",
    Comma => ",",
    Elipsis3 => "...",
    Elipsis2 => "..",
    Colon => ":",
    Equal => "=",
    // Keywords:
    Void => "void",
    Bool => "bool",
    F32 => "f32",
    F64 => "f64",
    ISize => "isize",
    USize => "usize",
    Const => "const",
    Volatile => "volatile",
    Noalias => "noalias",
    Fn => "fn",
    Let => "let",
    Var => "var",
    If => "if",
    As => "as",
    Else => "else",
    Return => "return",
    Struct => "struct",
    Type => "type",
    Union => "union",
    Enum => "enum",
    Packed => "packed",
    Extern => "extern",
    Pub => "pub",
    // Operators:
    Dot => ".",
    MinusGreater => "->",
    Bang => "!",
    Tilde => "~",
    Plus => "+",
    Minus => "-",
    Star => "*",
    Slash => "/",
    Percent => "%",
    Less => "<",
    Greater => ">",
    LessEqual => "<=",
    GreaterEqual => ">=",
    EqualEqual => "==",
    BangEqual => "!=",
    PipePipe => "||",
    AmpersandAmpersand => "&&",
    Ampersand => "&",
    Caret => "^",
    Pipe => "|",
    LessLess => "<<",
    GreaterGreater => ">>",
    Question => "?",
    PlusEqual => "+=",
    MinusEqual => "-=",
    StarEqual => "*=",
    SlashEqual => "/=",
    PercentEqual => "%=",
    AmpersandEqual => "&=",
    PipeEqual => "|=",
    CaretEqual => "^=",
    LessLessEqual => "<<=",
    GreaterGreaterEqual => ">>="
});

impl Token {
    pub fn is_assignment_op(self) -> bool {
        match self {
            Token::PlusEqual
            | Token::MinusEqual
            | Token::StarEqual
            | Token::SlashEqual
            | Token::PercentEqual
            | Token::PipeEqual
            | Token::CaretEqual
            | Token::AmpersandEqual
            | Token::LessLessEqual
            | Token::GreaterGreaterEqual
            | Token::Equal => true,
            _ => false,
        }
    }

    pub fn is_unary_op(self) -> bool {
        match self {
            Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang => true,
            _ => false,
        }
    }

    pub fn is_binary_op(self) -> bool {
        match self {
            Token::Star
            | Token::Slash
            | Token::Percent
            | Token::Pipe
            | Token::Ampersand
            | Token::Caret
            | Token::Plus
            | Token::Minus
            | Token::PipePipe
            | Token::AmpersandAmpersand
            | Token::BangEqual
            | Token::EqualEqual
            | Token::Less
            | Token::Greater
            | Token::LessEqual
            | Token::GreaterEqual
            | Token::LessLess
            | Token::GreaterGreater => true,
            _ => false,
        }
    }
}

/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
/// integral types, constants or identifiers, are not parsed by this.
pub struct LexemeParser {
    lexemes: Vec<Token>,
    candidates: Vec<Token>,
    len: usize,
}

impl LexemeParser {
    pub fn new() -> Self {
        let lexemes = Token::lexemes()
            .iter()
            .map(|&(tok, _)| tok)
            .collect::<Vec<_>>();

        Self {
            lexemes,
            candidates: vec![],
            len: 0,
        }
    }

    pub fn finish(mut self) -> Option<Token> {
        self.candidates.pop()
    }

    pub fn parse(mut tokens: impl Iterator<Item = char>) -> Option<Token> {
        let mut this = Self::new();
        loop {
            let Some(ch) = tokens.next() else {
                break;
            };
            if crate::common::is_whitespace(ch) {
                break;
            }
            match this.advance(ch)? {
                None => {}
                Some(token) => {
                    return Some(token);
                }
            }
        }
        this.finish()
    }

    /// Accepts a `char` and returns `Some(None)` until it is done trying to
    /// parse the longest lexeme. Once no potential matches remain, returns the
    /// longest matched token as `Some(Some(token))`, or `None` if nothing
    /// matched.
    pub fn advance(&mut self, ch: char) -> Option<Option<Token>> {
        self.len += 1;

        // Advance the match: keep only tokens whose lexemes match the next
        // char. Every lexeme here is single-byte ASCII, so indexing into the
        // bytes and comparing byte-as-char against `ch` is sound.
        self.lexemes
            .retain(|tok| tok.lexeme().unwrap().as_bytes()[self.len - 1] as char == ch);

        // A token has been matched completely if it has not yet been removed
        // from the lexeme list but the length of its lexeme is no greater than
        // the number of chars we've received.
        self.candidates.extend(
            self.lexemes
                .extract_if(.., |tok| tok.lexeme().unwrap().len() <= self.len),
        );

        // We prefer the longer match ("maximal munch"). That means `a+++++b`
        // doesn't parse, and `a+++(++b)` must be written to get `a++ + ++b`.
        // `&&i` likewise lexes as AmpersandAmpersand i, not Ampersand
        // Ampersand i. Somehow, this is also a gnu extension...
        if self.lexemes.is_empty() {
            // No more possible matches; return the longest one, if it exists.
            return self.candidates.pop().map(Some);
        }

        Some(None)
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenPos {
    pub token: Token,
    pub start: u32,
    pub end: u32,
}

impl TokenPos {
    pub fn new(token: Token, start: u32, end: u32) -> Self {
        Self { token, start, end }
    }
}

pub static PRECEDENCE_MAP: std::sync::LazyLock<HashMap<Token, usize>> =
    std::sync::LazyLock::new(|| {
        HashMap::from([
            (Token::PipePipe, 10),
            (Token::AmpersandAmpersand, 20),
            (Token::Pipe, 30),
            (Token::Caret, 40),
            (Token::Ampersand, 50),
            (Token::BangEqual, 60),
            (Token::EqualEqual, 60),
            (Token::LessEqual, 70),
            (Token::GreaterEqual, 70),
            (Token::Less, 70),
            (Token::Greater, 70),
            (Token::GreaterGreater, 80),
            (Token::LessLess, 80),
            (Token::Plus, 90),
            (Token::Minus, 90),
            (Token::Percent, 100),
            (Token::Star, 100),
            (Token::Slash, 100),
        ])
    });
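
// The tests below are a minimal sketch of how `LexemeParser` and
// `PRECEDENCE_MAP` are meant to be used. The expectations follow from the
// longest-match rule documented on `advance` and from the precedence table
// above; the cases themselves are illustrative, not exhaustive.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn prefers_the_longest_lexeme() {
        // ">>=" also begins Greater and GreaterGreater, but the longest
        // complete match wins.
        assert_eq!(
            LexemeParser::parse(">>=".chars()),
            Some(Token::GreaterGreaterEqual)
        );
        // "&&" lexes as a single AmpersandAmpersand, not two Ampersands.
        assert_eq!(
            LexemeParser::parse("&&".chars()),
            Some(Token::AmpersandAmpersand)
        );
        // A lone "&" still falls back to the shorter match via `finish`.
        assert_eq!(LexemeParser::parse("&".chars()), Some(Token::Ampersand));
    }

    #[test]
    fn precedence_orders_operator_tiers() {
        // Multiplicative operators bind tighter than additive ones, which in
        // turn bind tighter than comparisons and logical operators.
        assert!(PRECEDENCE_MAP[&Token::Star] > PRECEDENCE_MAP[&Token::Plus]);
        assert!(PRECEDENCE_MAP[&Token::Plus] > PRECEDENCE_MAP[&Token::Less]);
        assert!(PRECEDENCE_MAP[&Token::Less] > PRECEDENCE_MAP[&Token::AmpersandAmpersand]);
    }
}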