#![feature(slice_swap_unchecked, iter_collect_into)]

mod is_things {
    /// True if `c` is considered whitespace according to the Rust language definition.
    /// See the [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
    /// for definitions of these classes.
    pub fn is_whitespace(c: char) -> bool {
        // This is Pattern_White_Space.
        //
        // Note that this set is stable (ie, it doesn't change with different
        // Unicode versions), so it's ok to just hard-code the values.
        matches!(
            c,
            // Usual ASCII suspects
            '\u{0009}'   // \t
            | '\u{000A}' // \n
            | '\u{000B}' // vertical tab
            | '\u{000C}' // form feed
            | '\u{000D}' // \r
            | '\u{0020}' // space
            // NEXT LINE from latin1
            | '\u{0085}'
            // Bidi markers
            | '\u{200E}' // LEFT-TO-RIGHT MARK
            | '\u{200F}' // RIGHT-TO-LEFT MARK
            // Dedicated whitespace characters from Unicode
            | '\u{2028}' // LINE SEPARATOR
            | '\u{2029}' // PARAGRAPH SEPARATOR
        )
    }

    /// True if `c` is valid as a first character of an identifier.
    /// Unlike in Rust, `-` is also accepted, so kebab-case names lex as a
    /// single identifier. See the
    /// [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html)
    /// for a formal definition of a valid identifier name.
    pub fn is_id_start(c: char) -> bool {
        // This is XID_Start OR '_' OR '-' (the latter two are formally not XID_Start).
        c == '_' || c == '-' || unicode_xid::UnicodeXID::is_xid_start(c)
    }

    /// True if `c` is valid as a non-first character of an identifier.
    /// Unlike in Rust, `-` is also accepted. See the
    /// [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html)
    /// for a formal definition of a valid identifier name.
    pub fn is_id_continue(c: char) -> bool {
        unicode_xid::UnicodeXID::is_xid_continue(c) || c == '-'
    }

    /// True if the passed string is lexically an identifier.
    pub fn is_ident(string: &str) -> bool {
        let mut chars = string.chars();
        if let Some(start) = chars.next() {
            is_id_start(start) && chars.all(is_id_continue)
        } else {
            false
        }
    }

    // Note: no `#[expect(dead_code)]` here; `is_digit` is used by the
    // `TokenIterator` below, so the expectation would be unfulfilled.
    pub fn is_digit(ch: char) -> bool {
        ('0'..='9').contains(&ch)
    }

    #[expect(dead_code)]
    pub fn is_bin_digit(ch: char) -> bool {
        ch == '0' || ch == '1'
    }

    #[expect(dead_code)]
    pub fn is_nonzero_digit(ch: char) -> bool {
        ('1'..='9').contains(&ch)
    }

    #[expect(dead_code)]
    pub fn is_oct_digit(ch: char) -> bool {
        ('0'..='7').contains(&ch)
    }

    #[expect(dead_code)]
    pub fn is_hex_digit(ch: char) -> bool {
        ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch)
    }
}
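// Illustrative checks, added alongside the definitions above (not part of the
// original test suite): the identifier rules intentionally deviate from Rust
// by accepting `-`, so kebab-case names count as single identifiers.
#[cfg(test)]
mod is_things_examples {
    use super::is_things;

    #[test]
    fn kebab_case_idents() {
        assert!(is_things::is_ident("my-function"));
        assert!(is_things::is_ident("_tmp"));
        // Identifiers may not start with a digit, and may not be empty.
        assert!(!is_things::is_ident("9lives"));
        assert!(!is_things::is_ident(""));
    }
}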
macro_rules! tokens {
    ($vis:vis $ty_name:ident: { $($name2:ident),* }, { $($name:ident => $lexeme:literal),* }) => {
        #[allow(dead_code)]
        #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)]
        $vis enum $ty_name {
            $($name,)*
            $($name2,)*
        }

        impl ::core::fmt::Display for $ty_name {
            fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
                match self {
                    $(Self::$name => write!(f, "{}", $lexeme),)*
                    $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),*
                }
            }
        }

        #[allow(dead_code)]
        impl $ty_name {
            $vis fn lexeme(&self) -> Option<&'static str> {
                match self {
                    $(Self::$name => Some($lexeme),)*
                    $(Self::$name2 => None),*
                }
            }

            /// Returns the number of chars in this lexeme.
            $vis fn lexeme_len(&self) -> usize {
                self.lexeme().map(|lexeme| lexeme.chars().count()).unwrap_or(0)
            }

            /// Returns the number of bytes in the UTF-8 encoding of this lexeme.
            $vis fn lexeme_len_utf8(&self) -> usize {
                self.lexeme().map(|lexeme| lexeme.len()).unwrap_or(0)
            }

            $vis fn maybe_ident(&self) -> bool {
                self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false)
            }

            $vis fn lexemes() -> &'static [(Self, &'static str)] {
                &[ $((Self::$name, $lexeme)),* ]
            }
        }
    };
}

tokens!(pub Token: {
    Eof,
    ParseError,
    // Marker tokens for comments
    Comment,
    DocComment,
    // Marker tokens for constants
    CharConstant,
    IntegerConstant,
    IntegerHexConstant,
    IntegerBinConstant,
    IntegerOctConstant,
    FloatingConstant,
    FloatingExpConstant,
    DotFloatingConstant,
    DotFloatingExpConstant,
    StringConstant,
    Ident
},
// Lexical tokens:
{
    SlashSlash => "//",
    SlashSlashSlash => "///",
    // SlashStar => "/*",
    // SlashStarStar => "/**",
    // StarSlash => "*/",
    // Punctuation:
    OpenParens => "(",
    CloseParens => ")",
    OpenBrace => "{",
    CloseBrace => "}",
    OpenSquareBracket => "[",
    CloseSquareBracket => "]",
    Semi => ";",
    Comma => ",",
    Elipsis3 => "...",
    Elipsis2 => "..",
    Colon => ":",
    Equal => "=",
    // Keywords:
    Void => "void",
    Bool => "bool",
    F32 => "f32",
    F64 => "f64",
    ISize => "isize",
    USize => "usize",
    Const => "const",
    Volatile => "volatile",
    Noalias => "noalias",
    Fn => "fn",
    Let => "let",
    Var => "var",
    If => "if",
    As => "as",
    Else => "else",
    Return => "return",
    Struct => "struct",
    Type => "type",
    Union => "union",
    Enum => "enum",
    Packed => "packed",
    Extern => "extern",
    Pub => "pub",
    // Operators:
    Dot => ".",
    MinusGreater => "->",
    Bang => "!",
    Tilde => "~",
    Plus => "+",
    PlusPlus => "++",
    Minus => "-",
    MinusMinus => "--",
    Star => "*",
    Slash => "/",
    Percent => "%",
    Less => "<",
    Greater => ">",
    LessEqual => "<=",
    GreaterEqual => ">=",
    EqualEqual => "==",
    BangEqual => "!=",
    PipePipe => "||",
    AmpersandAmpersand => "&&",
    Ampersand => "&",
    Caret => "^",
    Pipe => "|",
    LessLess => "<<",
    GreaterGreater => ">>",
    Question => "?",
    PlusEqual => "+=",
    MinusEqual => "-=",
    StarEqual => "*=",
    SlashEqual => "/=",
    PercentEqual => "%=",
    AmpersandEqual => "&=",
    PipeEqual => "|=",
    CaretEqual => "^=",
    LessLessEqual => "<<=",
    GreaterGreaterEqual => ">>="
});

impl Token {
    pub fn is_assignment_op(self) -> bool {
        matches!(
            self,
            Token::PlusEqual
                | Token::MinusEqual
                | Token::StarEqual
                | Token::SlashEqual
                | Token::PercentEqual
                | Token::PipeEqual
                | Token::CaretEqual
                | Token::AmpersandEqual
                | Token::LessLessEqual
                | Token::GreaterGreaterEqual
                | Token::Equal
        )
    }

    pub fn is_unary_op(self) -> bool {
        matches!(
            self,
            Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang
        )
    }

    pub fn is_binary_op(self) -> bool {
        matches!(
            self,
            Token::Star
                | Token::Slash
                | Token::Percent
                | Token::Pipe
                | Token::Ampersand
                | Token::Caret
                | Token::Plus
                | Token::Minus
                | Token::PipePipe
                | Token::AmpersandAmpersand
                | Token::BangEqual
                | Token::EqualEqual
                | Token::Less
                | Token::Greater
                | Token::LessEqual
                | Token::GreaterEqual
                | Token::LessLess
                | Token::GreaterGreater
        )
    }
}
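// A small added illustration (not part of the original test suite) of the API
// generated by the `tokens!` macro above: tokens with a fixed lexeme report
// it, marker tokens do not, and `Display` wraps marker names in angle brackets.
#[cfg(test)]
mod token_macro_examples {
    use super::*;

    #[test]
    fn generated_token_api() {
        assert_eq!(Token::Fn.lexeme(), Some("fn"));
        assert_eq!(Token::Ident.lexeme(), None);
        assert_eq!(Token::LessLessEqual.lexeme_len(), 3);
        assert_eq!(Token::Fn.to_string(), "fn");
        assert_eq!(Token::Eof.to_string(), "<Eof>");
        // Keyword lexemes look like identifiers; operator lexemes do not.
        assert!(Token::Const.maybe_ident());
        assert!(!Token::PlusPlus.maybe_ident());
    }
}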
/// A list of lexemes used by the `LexemeParser`.
///
/// `lexemes` contains every token that has a defined lexeme, such as `fn`,
/// `f32`, `const`, etc. The `LexemeList` keeps track of two offsets into the
/// `lexemes` array, splitting it into three windows:
///
/// - `[0, start_candidates)`: tokens that are still being considered for parsing
/// - `[start_candidates, end_candidates)`: tokens whose whole lexeme has matched (the candidates)
/// - `[end_candidates, len)`: tokens that have been filtered out and are no longer considered
///
/// On each iteration of the parsing loop, the remaining tokens are matched
/// against the next character and, if they match completely, are swapped into
/// the candidates window, or swapped to the end if they no longer match.
struct LexemeList {
    lexemes: Box<[Token]>,
    start_candidates: usize,
    end_candidates: usize,
    filtered: Vec<(usize, FilterResult)>,
}

enum FilterResult {
    Remove,
    Candidate,
}

impl LexemeList {
    fn new() -> Self {
        let lexemes = Token::lexemes()
            .iter()
            .map(|(tok, _)| *tok)
            .collect::<Box<[Token]>>();
        Self {
            start_candidates: lexemes.len(),
            end_candidates: lexemes.len(),
            lexemes,
            filtered: Vec::new(),
        }
    }

    fn clear(&mut self) {
        self.start_candidates = self.lexemes.len();
        self.end_candidates = self.lexemes.len();
    }

    fn remaining(&self) -> &[Token] {
        &self.lexemes[0..self.start_candidates]
    }

    fn candidates(&self) -> &[Token] {
        &self.lexemes[self.start_candidates..self.end_candidates]
    }

    fn step(&mut self, ch: char, pos: usize) {
        // Smartly reuse the allocation for `filtered`.
        // Truly one of the premature optimizations, but it just feels good, innit?
        let mut filtered = core::mem::take(&mut self.filtered);
        self.remaining()
            .iter()
            .enumerate()
            .filter_map(|(i, tok)| {
                let bytes = tok.lexeme().unwrap().as_bytes();
                // SAFETY: all tokens in `self.remaining()` are lexical tokens,
                // and they are all valid ASCII.
                let c = unsafe {
                    // TODO: maybe keep a list of `Char<'_>`s around in order to
                    // support fully utf8 tokens?
                    char::from_u32_unchecked(bytes[pos] as u32)
                };
                match c == ch {
                    false => Some((i, FilterResult::Remove)),
                    true if bytes.len() <= pos + 1 => Some((i, FilterResult::Candidate)),
                    true => None,
                }
            })
            .collect_into(&mut filtered);

        // Iterate in reverse order so that we can safely swap elements.
        // Drain here so that we can reuse the `filtered` Vec allocation.
        filtered.drain(..).rev().for_each(|(i, f)| {
            match f {
                // For candidates, swap the candidate with the last remaining
                // token, then decrement `start_candidates`.
                FilterResult::Candidate => {
                    // SAFETY: we know that `i` and `self.start_candidates - 1`
                    // are both valid indices: `self.start_candidates` starts at
                    // the end and each time it is decremented, one more element
                    // is removed from the front, so that as long as an element
                    // is remaining, `self.start_candidates` is always greater
                    // than 0.
                    // The order of the remaining elements is not meaningfully
                    // impacted because we only ever swap with elements at or
                    // after `i`, and `i` is the greatest index we will touch.
                    unsafe {
                        self.lexemes.swap_unchecked(i, self.start_candidates - 1);
                        self.start_candidates = self.start_candidates.saturating_sub(1);
                    }
                }
                // For removes, swap the removed token with the last remaining
                // token, then swap it past the candidates window, then
                // decrement both `start_candidates` and `end_candidates`.
                // (The original order of these two swaps was reversed, which
                // cancelled them out when `i == start_candidates - 1` and let
                // a removed token leak into the candidates window.)
                FilterResult::Remove => {
                    // SAFETY: the same argument as above applies; additionally
                    // `end_candidates >= start_candidates > 0`.
                    unsafe {
                        // In the case that `start_candidates` ==
                        // `end_candidates`, the second swap is a no-op and
                        // that's fine.
                        // remove this:  v
                        // [a,b,c][d,e,f][g,h,i]
                        // swap these: ^ ^
                        // [a,c,b][d,e,f][g,h,i]
                        // swap these:   ^     ^
                        // [a,c,f][d,e,b][g,h,i]
                        // decrement both counters:
                        // [a,c][f,d,e][b,g,h,i]
                        self.lexemes.swap_unchecked(i, self.start_candidates - 1);
                        self.lexemes
                            .swap_unchecked(self.start_candidates - 1, self.end_candidates - 1);
                        self.start_candidates = self.start_candidates.saturating_sub(1);
                        self.end_candidates = self.end_candidates.saturating_sub(1);
                    }
                }
            }
        });
        // Put `filtered` back so the next call can reuse its allocation.
        self.filtered = filtered;
    }
}
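// A minimal sketch, added for illustration, of how the window partition
// described in the `LexemeList` docs evolves. Only `contains`-style assertions
// are made where the order inside a window is an implementation detail.
#[cfg(test)]
mod lexeme_list_examples {
    use super::*;

    #[test]
    fn window_evolution() {
        let mut list = LexemeList::new();
        // Feeding '<' at position 0: "<" is complete and becomes the only
        // candidate, while "<=", "<<" and "<<=" stay in the remaining window.
        list.step('<', 0);
        assert_eq!(list.candidates(), &[Token::Less]);
        assert!(list
            .remaining()
            .iter()
            .all(|t| t.lexeme().unwrap().starts_with('<')));
        // Feeding '=' at position 1 completes "<=" and removes "<<" and "<<=".
        list.step('=', 1);
        assert!(list.candidates().contains(&Token::LessEqual));
        assert!(list.remaining().is_empty());
    }
}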
/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
/// integral types, constants or identifiers are not parsed by this.
pub struct LexemeParser {
    lexemes: LexemeList,
    len: usize,
}

impl LexemeParser {
    pub fn new() -> Self {
        Self {
            lexemes: LexemeList::new(),
            len: 0,
        }
    }

    pub fn parse(&mut self, mut tokens: impl Iterator<Item = char>) -> Option<Token> {
        self.lexemes.clear();
        self.len = 0;
        loop {
            let Some(ch) = tokens.next() else {
                break;
            };
            if crate::is_things::is_whitespace(ch) {
                break;
            }
            self.lexemes.step(ch, self.len);
            // Advance the position; the original never incremented `len`, so
            // every step compared against the first byte of each lexeme.
            self.len += 1;
            if self.lexemes.remaining().is_empty() {
                break;
            }
        }
        // Pick the longest completed lexeme. The candidates window is not kept
        // in insertion order, so search for the maximum explicitly.
        self.lexemes
            .candidates()
            .iter()
            .max_by_key(|tok| tok.lexeme_len())
            .copied()
    }
}

use trie::Tree;

#[derive(Debug, Clone, Copy)]
struct CountingIterator<I> {
    iter: I,
    count: usize,
}

impl<I> From<I> for CountingIterator<I> {
    fn from(iter: I) -> Self {
        Self { iter, count: 0 }
    }
}

impl<I: Iterator<Item = char>> Iterator for CountingIterator<I> {
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        self.iter.next().inspect(|c| self.count += c.len_utf8())
    }
}

impl<I> CountingIterator<I> {
    /// The number of bytes consumed so far.
    pub(crate) fn offset(&self) -> usize {
        self.count
    }
}

impl<I> core::ops::Deref for CountingIterator<I> {
    type Target = I;

    fn deref(&self) -> &Self::Target {
        &self.iter
    }
}

impl<I> core::ops::DerefMut for CountingIterator<I> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.iter
    }
}

type Source<'a> = CountingIterator<core::iter::Peekable<core::str::Chars<'a>>>;

pub struct TokenIterator<'a> {
    trie: Tree<char, Token>,
    source: &'a str,
    offset: usize,
}

impl<'a> TokenIterator<'a> {
    pub fn new(source: &'a str) -> Self {
        let mut trie = Tree::new();
        for (token, token_str) in Token::lexemes() {
            trie.insert(token_str.chars(), *token);
        }
        Self {
            trie,
            source,
            offset: 0,
        }
    }

    fn peekable_source(&self) -> Source<'a> {
        CountingIterator::from(self.source[self.offset..].chars().peekable())
    }

    fn parse(&mut self) -> Option<Token> {
        let mut iter = CountingIterator::from(self.source[self.offset..].chars());
        match self.trie.get_closest(&mut iter) {
            Some(token) => {
                // Advance past the matched lexeme; `offset` is counted in
                // bytes, so use the UTF-8 length here.
                self.offset += token.lexeme_len_utf8();
                Some(*token)
            }
            None => None,
        }
    }

    fn skip_whitespaces(&mut self) -> usize {
        self.skip_while(is_things::is_whitespace)
    }

    fn skip(&mut self, mut n: usize) -> usize {
        // Check before decrementing: the original `n -= 1; n > 0` skipped one
        // character too few.
        self.skip_while(|_| {
            if n == 0 {
                return false;
            }
            n -= 1;
            true
        })
    }

    fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) -> usize {
        let mut count = 0;
        loop {
            let Some(c) = self.source[self.offset..].chars().next() else {
                break;
            };
            if pred(c) {
                self.offset += c.len_utf8();
                count += c.len_utf8();
            } else {
                break;
            }
        }
        count
    }
}
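// Added illustration of `LexemeParser` (not part of the original tests): it
// consumes characters until whitespace or until no lexeme can still match,
// then reports the longest completed lexeme.
#[cfg(test)]
mod lexeme_parser_examples {
    use super::*;

    #[test]
    fn longest_match_wins() {
        let mut parser = LexemeParser::new();
        // Stops at the space; "fn" is the only candidate.
        assert_eq!(parser.parse("fn main".chars()), Some(Token::Fn));
        // ".", ".." and "..." all complete; the longest one is reported.
        assert_eq!(parser.parse("...".chars()), Some(Token::Elipsis3));
        // Nothing matches at all.
        assert_eq!(parser.parse("@@".chars()), None);
    }
}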
impl<'a> Iterator for TokenIterator<'a> {
    type Item = (Token, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        // Skip leading whitespace.
        self.skip_whitespaces();
        let start = self.offset;
        let mut source = self.peekable_source();
        let mut cursor = self.peekable_source();
        match cursor.next() {
            Some('0'..='9') => {
                let token = complex_tokens::parse_constant(&mut source).ok()?;
                self.offset += source.offset();
                Some((token, &self.source[start..self.offset]))
            }
            Some('.') if cursor.next().map_or(false, is_things::is_digit) => {
                let token = complex_tokens::parse_constant(&mut source).ok()?;
                self.offset += source.offset();
                Some((token, &self.source[start..self.offset]))
            }
            Some('\'' | '"') => {
                let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?;
                self.offset += source.offset();
                Some((token, &self.source[start..self.offset]))
            }
            _ => match self.parse().map(|tok| match tok {
                Token::SlashSlash => {
                    // A comment runs to the end of the line; the original
                    // predicate `c == '\n'` skipped nothing at all.
                    self.skip_while(|c| c != '\n');
                    Token::Comment
                }
                Token::SlashSlashSlash => {
                    self.skip_while(|c| c != '\n');
                    Token::DocComment
                }
                _ => tok,
            }) {
                Some(tok) => {
                    // A keyword immediately followed by more identifier
                    // characters is actually an identifier (e.g. `letter`).
                    if tok.maybe_ident() && self.skip_while(is_things::is_id_continue) > 0 {
                        Some((Token::Ident, &self.source[start..self.offset]))
                    } else {
                        Some((tok, &self.source[start..self.offset]))
                    }
                }
                None => {
                    if self
                        .peekable_source()
                        .next()
                        .map_or(false, is_things::is_id_start)
                    {
                        self.skip(1);
                        self.skip_while(is_things::is_id_continue);
                        Some((Token::Ident, &self.source[start..self.offset]))
                    } else {
                        None
                    }
                }
            },
        }
    }
}

mod complex_tokens;

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_iterator() {
        let tokens = "fn let void+++(++bool)";
        let mut lexer = TokenIterator::new(tokens);
        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
        assert_eq!(lexer.next(), Some((Token::Let, "let")));
        assert_eq!(lexer.next(), Some((Token::Void, "void")));
        assert_eq!(lexer.next(), Some((Token::PlusPlus, "++")));
        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
        assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
        assert_eq!(lexer.next(), Some((Token::PlusPlus, "++")));
        assert_eq!(lexer.next(), Some((Token::Bool, "bool")));
        assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
        assert_eq!(lexer.next(), None);
    }

    #[test]
    fn complex_iterator() {
        let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }";
        let mut lexer = TokenIterator::new(tokens);
        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
        assert_eq!(lexer.next(), Some((Token::Ident, "my-function")));
        assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
        assert_eq!(lexer.next(), Some((Token::Ident, "x")));
        assert_eq!(lexer.next(), Some((Token::Colon, ":")));
        assert_eq!(lexer.next(), Some((Token::Ident, "i32")));
        assert_eq!(lexer.next(), Some((Token::Comma, ",")));
        assert_eq!(lexer.next(), Some((Token::Ident, "y")));
        assert_eq!(lexer.next(), Some((Token::Colon, ":")));
        assert_eq!(lexer.next(), Some((Token::F32, "f32")));
        assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
        assert_eq!(lexer.next(), Some((Token::MinusGreater, "->")));
        assert_eq!(lexer.next(), Some((Token::F32, "f32")));
        assert_eq!(lexer.next(), Some((Token::OpenBrace, "{")));
        assert_eq!(lexer.next(), Some((Token::Return, "return")));
        assert_eq!(lexer.next(), Some((Token::Ident, "x")));
        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
        assert_eq!(lexer.next(), Some((Token::Ident, "y")));
        assert_eq!(lexer.next(), Some((Token::Semi, ";")));
        assert_eq!(lexer.next(), Some((Token::CloseBrace, "}")));
        assert_eq!(lexer.next(), None);
    }
}
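// Added illustration of comment lexing (not part of the original tests).
// This assumes `trie::Tree::get_closest` returns the value at the deepest
// matching node, so "// hello" matches "//" rather than "///"; a comment then
// extends to, but does not consume, the terminating newline.
#[cfg(test)]
mod comment_examples {
    use super::*;

    #[test]
    fn comments_run_to_end_of_line() {
        let mut lexer = TokenIterator::new("// hello\nfn");
        assert_eq!(lexer.next(), Some((Token::Comment, "// hello")));
        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
        assert_eq!(lexer.next(), None);
    }
}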