#![feature(slice_swap_unchecked, iter_collect_into)] mod is_things { /// True if `c` is considered a whitespace according to Rust language definition. /// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html) /// for definitions of these classes. pub fn is_whitespace(c: char) -> bool { // This is Pattern_White_Space. // // Note that this set is stable (ie, it doesn't change with different // Unicode versions), so it's ok to just hard-code the values. matches!( c, // Usual ASCII suspects '\u{0009}' // \t | '\u{000A}' // \n | '\u{000B}' // vertical tab | '\u{000C}' // form feed | '\u{000D}' // \r | '\u{0020}' // space // NEXT LINE from latin1 | '\u{0085}' // Bidi markers | '\u{200E}' // LEFT-TO-RIGHT MARK | '\u{200F}' // RIGHT-TO-LEFT MARK // Dedicated whitespace characters from Unicode | '\u{2028}' // LINE SEPARATOR | '\u{2029}' // PARAGRAPH SEPARATOR ) } /// True if `c` is valid as a first character of an identifier. /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for /// a formal definition of valid identifier name. pub fn is_id_start(c: char) -> bool { // This is XID_Start OR '_' (which formally is not a XID_Start). c == '_' || c == '-' || unicode_xid::UnicodeXID::is_xid_start(c) } /// True if `c` is valid as a non-first character of an identifier. /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for /// a formal definition of valid identifier name. pub fn is_id_continue(c: char) -> bool { unicode_xid::UnicodeXID::is_xid_continue(c) || c == '-' } /// The passed string is lexically an identifier. pub fn is_ident(string: &str) -> bool { let mut chars = string.chars(); if let Some(start) = chars.next() { is_id_start(start) && chars.all(is_id_continue) } else { false } } pub fn is_digit(ch: char) -> bool { ('0'..='9').contains(&ch) } pub fn is_bin_digit(ch: char) -> bool { ch == '0' || ch == '1' } #[expect(dead_code)] pub fn is_nonzero_digit(ch: char) -> bool { ('1'..='9').contains(&ch) } pub fn is_oct_digit(ch: char) -> bool { ('0'..='7').contains(&ch) } pub fn is_hex_digit(ch: char) -> bool { ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch) } } macro_rules! tokens { ($vis:vis $ty_name:ident: { $($name2:ident),* }, { $($name:ident => $lexeme:literal),* }) => { #[allow(dead_code)] #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)] $vis enum $ty_name { $($name, )* $($name2,)* } impl ::core::fmt::Display for $ty_name { fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { match self { $(Self::$name => write!(f, "{}", $lexeme),)* $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),* } } } #[allow(dead_code)] impl $ty_name { $vis fn lexeme(&self) -> Option<&'static str> { match self { $(Self::$name => Some($lexeme),)* $(Self::$name2 => None),* } } /// returns the number of chars in this lexeme $vis fn lexeme_len(&self) -> usize { self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0) } /// returns the number of chars in this lexeme $vis fn lexeme_len_utf8(&self) -> usize { self.lexeme().map(|lexeme|lexeme.len()).unwrap_or(0) } $vis fn maybe_ident(&self) -> bool { self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false) } $vis fn lexemes() -> &'static [(Self, &'static str)] { &[ $((Self::$name, $lexeme)),* ] } } }; } tokens!(pub Token: { Eof, ParseError, // Marker Token for any Comment Comment, DocComment, // Marker Token for any pre-processing directive CharConstant, IntegerConstant, IntegerHexConstant, IntegerBinConstant, IntegerOctConstant, FloatingConstant, FloatingExpConstant, DotFloatingConstant, DotFloatingExpConstant, StringConstant, Ident }, // Lexical Tokens: { SlashSlash => "//", SlashSlashSlash => "///", // SlashStar => "/*", // SlashStarStar => "/**", //StarSlash => "*/", // Punctuation: OpenParens => "(", CloseParens => ")", OpenBrace => "{", CloseBrace => "}", OpenSquareBracket => "[", CloseSquareBracket => "]", Semi => ";", Comma => ",", Elipsis3 => "...", Elipsis2 => "..", Colon => ":", Equal => "=", // Keywords: Void => "void", Bool => "bool", F32 => "f32", F64 => "f64", ISize => "isize", USize => "usize", U1 => "u1", U8 => "u8", U16 => "u16", U32 => "u32", U64 => "u64", I1 => "i1", I8 => "i8", I16 => "i16", I32 => "i32", I64 => "i64", Const => "const", Volatile => "volatile", Noalias => "noalias", Fn => "fn", Let => "let", Var => "var", If => "if", As => "as", Else => "else", Return => "return", Struct => "struct", Type => "type", Union => "union", Enum => "enum", Packed => "packed", Extern => "extern", Pub => "pub", // Operators Dot => ".", MinusGreater => "->", Bang => "!", Tilde => "~", Plus => "+", // PlusPlus => "++", Minus => "-", // MinusMinus => "--", Star => "*", Slash => "/", Percent => "%", Less => "<", Greater => ">", LessEqual => "<=", GreaterEqual => ">=", EqualEqual => "==", BangEqual => "!=", PipePipe => "||", AmpersandAmpersand => "&&", Ampersand => "&", Caret => "^", Pipe => "|", LessLess => "<<", GreaterGreater => ">>", Question => "?", PlusEqual => "+=", MinusEqual => "-=", StarEqual => "*=", SlashEqual => "/=", PercentEqual => "%=", AmpersandEqual => "&=", PipeEqual => "|=", CaretEqual => "^=", LessLessEqual => "<<=", GreaterGreaterEqual => ">>=" }); impl Token { pub fn is_assignment_op(self) -> bool { match self { Token::PlusEqual | Token::MinusEqual | Token::StarEqual | Token::SlashEqual | Token::PercentEqual | Token::PipeEqual | Token::CaretEqual | Token::AmpersandEqual | Token::LessLessEqual | Token::GreaterGreaterEqual | Token::Equal => true, _ => false, } } pub fn is_unary_op(self) -> bool { match self { Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang => true, _ => false, } } pub fn is_binary_op(self) -> bool { match self { Token::Star | Token::Slash | Token::Percent | Token::Pipe | Token::Ampersand | Token::Caret | Token::Plus | Token::Minus | Token::PipePipe | Token::AmpersandAmpersand | Token::BangEqual | Token::EqualEqual | Token::Less | Token::Greater | Token::LessEqual | Token::GreaterEqual | Token::LessLess | Token::GreaterGreater => true, _ => false, } } } use trie::Tree; #[derive(Debug, Clone, Copy)] struct CountingIterator { iter: I, count: usize, } impl From for CountingIterator { fn from(iter: I) -> Self { Self { iter, count: 0 } } } impl> Iterator for CountingIterator { type Item = I::Item; fn next(&mut self) -> Option { self.iter.next().inspect(|c| self.count += c.len_utf8()) } } impl CountingIterator { pub(crate) fn offset(&self) -> usize { self.count } } impl core::ops::Deref for CountingIterator { type Target = I; fn deref(&self) -> &Self::Target { &self.iter } } impl core::ops::DerefMut for CountingIterator { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.iter } } type Source<'a> = CountingIterator>>; pub struct TokenIterator<'a> { trie: Tree, source: &'a str, offset: usize, } impl<'a> TokenIterator<'a> { pub fn new(source: &'a str) -> Self { let mut trie = Tree::new(); for (token, token_str) in Token::lexemes() { trie.insert(token_str.chars(), *token); } Self { trie, source, offset: 0, } } fn peekable_source(&self) -> Source<'a> { CountingIterator::from(self.source[self.offset..].chars().peekable()) } fn parse(&mut self) -> Option { let mut iter = CountingIterator::from(self.source[self.offset..].chars()); match self.trie.get_closest(&mut iter) { Some(token) => { // skip the peeked item self.offset += token.lexeme_len(); Some(*token) } None => None, } } fn skip_whitespaces(&mut self) -> usize { self.skip_while(is_things::is_whitespace) } fn skip(&mut self, mut n: usize) -> usize { self.skip_while(|_| { n -= 1; n > 0 }) } fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) -> usize { let mut count = 0; loop { let Some(c) = self.source[self.offset..].chars().next() else { break; }; if pred(c) { self.offset += c.len_utf8(); count += c.len_utf8(); continue; } else { break; } } count } } impl<'a> Iterator for TokenIterator<'a> { type Item = (Token, &'a str); fn next(&mut self) -> Option { // skip whitespace self.skip_whitespaces(); let start = self.offset; let mut source = self.peekable_source(); let mut cursor = self.peekable_source(); let token = match cursor.next() { Some('0'..='9') => { let token = complex_tokens::parse_constant(&mut source).ok()?; self.offset += source.offset(); Some((token, &self.source[start..self.offset])) } Some('.') if cursor.next().map_or(false, is_things::is_digit) => { let token = complex_tokens::parse_constant(&mut source).ok()?; self.offset += source.offset(); Some((token, &self.source[start..self.offset])) } Some('\'' | '"') => { let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?; self.offset += source.offset(); Some((token, &self.source[start..self.offset])) } Some('`') => { // raw identifier self.skip(1); self.skip_while(|c| is_things::is_id_continue(c)); if self.peekable_source().next() == Some('`') { self.skip(1); Some((Token::Ident, &self.source[start..self.offset])) } else { // unterminated raw identifier Some((Token::ParseError, &self.source[start..self.offset])) } } // `//`-style comments or doc-comments _ => match self.parse().map(|tok| match tok { Token::SlashSlash => { self.skip_while(|c| c == '\n'); Token::Comment } Token::SlashSlashSlash => { self.skip_while(|c| c == '\n'); Token::DocComment } _ => tok, }) { Some(tok) => { if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 { Some((Token::Ident, &self.source[start..self.offset])) } else { Some((tok, &self.source[start..self.offset])) } } None => { if self .peekable_source() .next() .map_or(false, |c| is_things::is_id_start(c)) { self.skip(1); self.skip_while(|c| is_things::is_id_continue(c)); Some((Token::Ident, &self.source[start..self.offset])) } else { None } } }, }; token } } mod complex_tokens; #[cfg(test)] mod tests { use super::*; #[test] fn test_iterator() { let tokens = "fn let void+(+bool)"; let mut lexer = TokenIterator::new(&tokens); assert_eq!(lexer.next(), Some((Token::Fn, "fn"))); assert_eq!(lexer.next(), Some((Token::Let, "let"))); assert_eq!(lexer.next(), Some((Token::Void, "void"))); assert_eq!(lexer.next(), Some((Token::Plus, "+"))); assert_eq!(lexer.next(), Some((Token::OpenParens, "("))); assert_eq!(lexer.next(), Some((Token::Plus, "+"))); assert_eq!(lexer.next(), Some((Token::Bool, "bool"))); assert_eq!(lexer.next(), Some((Token::CloseParens, ")"))); assert_eq!(lexer.next(), None); } #[test] fn idents() { let lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123"); assert!(lexer.map(|(tok, _)| tok).all(|tok| tok == Token::Ident)); } #[test] fn ident_minus_ambiguity() { let lexer = TokenIterator::new("a-a a- - a -a --a"); let tokens = lexer.map(|(tok, _)| tok).collect::>(); assert_eq!( tokens, vec![ Token::Ident, Token::Ident, Token::Minus, Token::Ident, Token::Ident, Token::Ident ] ); } #[test] fn complex_iterator() { let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }"; let mut lexer = TokenIterator::new(&tokens); assert_eq!(lexer.next(), Some((Token::Fn, "fn"))); assert_eq!(lexer.next(), Some((Token::Ident, "my-function"))); assert_eq!(lexer.next(), Some((Token::OpenParens, "("))); assert_eq!(lexer.next(), Some((Token::Ident, "x"))); assert_eq!(lexer.next(), Some((Token::Colon, ":"))); assert_eq!(lexer.next(), Some((Token::I32, "i32"))); assert_eq!(lexer.next(), Some((Token::Comma, ","))); assert_eq!(lexer.next(), Some((Token::Ident, "y"))); assert_eq!(lexer.next(), Some((Token::Colon, ":"))); assert_eq!(lexer.next(), Some((Token::F32, "f32"))); assert_eq!(lexer.next(), Some((Token::CloseParens, ")"))); assert_eq!(lexer.next(), Some((Token::MinusGreater, "->"))); assert_eq!(lexer.next(), Some((Token::F32, "f32"))); assert_eq!(lexer.next(), Some((Token::OpenBrace, "{"))); assert_eq!(lexer.next(), Some((Token::Return, "return"))); assert_eq!(lexer.next(), Some((Token::Ident, "x"))); assert_eq!(lexer.next(), Some((Token::Plus, "+"))); assert_eq!(lexer.next(), Some((Token::Ident, "y"))); assert_eq!(lexer.next(), Some((Token::Semi, ";"))); assert_eq!(lexer.next(), Some((Token::CloseBrace, "}"))); assert_eq!(lexer.next(), None); } }