#![feature(slice_swap_unchecked, iter_collect_into, push_mut)] mod is_things { /// True if `c` is considered a whitespace according to Rust language definition. /// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html) /// for definitions of these classes. pub fn is_whitespace(c: char) -> bool { // This is Pattern_White_Space. // // Note that this set is stable (ie, it doesn't change with different // Unicode versions), so it's ok to just hard-code the values. matches!( c, // Usual ASCII suspects '\u{0009}' // \t | '\u{000A}' // \n | '\u{000B}' // vertical tab | '\u{000C}' // form feed | '\u{000D}' // \r | '\u{0020}' // space // NEXT LINE from latin1 | '\u{0085}' // Bidi markers | '\u{200E}' // LEFT-TO-RIGHT MARK | '\u{200F}' // RIGHT-TO-LEFT MARK // Dedicated whitespace characters from Unicode | '\u{2028}' // LINE SEPARATOR | '\u{2029}' // PARAGRAPH SEPARATOR ) } /// True if `c` is valid as a first character of an identifier. /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for /// a formal definition of valid identifier name. pub fn is_id_start(c: char) -> bool { // This is XID_Start OR '_' (which formally is not a XID_Start). c == '_' || c == '-' || unicode_xid::UnicodeXID::is_xid_start(c) } /// True if `c` is valid as a non-first character of an identifier. /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for /// a formal definition of valid identifier name. pub fn is_id_continue(c: char) -> bool { unicode_xid::UnicodeXID::is_xid_continue(c) || c == '-' } /// The passed string is lexically an identifier. pub fn is_ident(string: &str) -> bool { let mut chars = string.chars(); if let Some(start) = chars.next() { is_id_start(start) && chars.all(is_id_continue) } else { false } } pub fn is_digit(ch: char) -> bool { ('0'..='9').contains(&ch) } pub fn is_bin_digit(ch: char) -> bool { ch == '0' || ch == '1' } #[expect(dead_code)] pub fn is_nonzero_digit(ch: char) -> bool { ('1'..='9').contains(&ch) } pub fn is_oct_digit(ch: char) -> bool { ('0'..='7').contains(&ch) } pub fn is_hex_digit(ch: char) -> bool { ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch) } } macro_rules! tokens { ($vis:vis $ty_name:ident: { $($name2:ident),* }, { $($name:ident => $lexeme:literal),* }) => { #[allow(dead_code)] #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)] $vis enum $ty_name { $($name, )* $($name2,)* } impl ::core::fmt::Display for $ty_name { fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { match self { $(Self::$name => write!(f, "{}", $lexeme),)* $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),* } } } #[allow(dead_code)] impl $ty_name { $vis fn lexeme(&self) -> Option<&'static str> { match self { $(Self::$name => Some($lexeme),)* $(Self::$name2 => None),* } } /// returns the number of chars in this lexeme $vis fn lexeme_len(&self) -> usize { self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0) } /// returns the number of chars in this lexeme $vis fn lexeme_len_utf8(&self) -> usize { self.lexeme().map(|lexeme|lexeme.len()).unwrap_or(0) } $vis fn maybe_ident(&self) -> bool { self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false) } $vis fn lexemes() -> &'static [(Self, &'static str)] { &[ $((Self::$name, $lexeme)),* ] } } }; } tokens!(pub Token: { Eof, ParseError, // Marker Token for any Comment Comment, DocComment, // Marker Token for any pre-processing directive CharConstant, IntegerConstant, IntegerHexConstant, IntegerBinConstant, IntegerOctConstant, FloatingConstant, FloatingExpConstant, DotFloatingConstant, DotFloatingExpConstant, StringConstant, Ident }, // Lexical Tokens: { SlashSlash => "//", SlashSlashSlash => "///", // SlashStar => "/*", // SlashStarStar => "/**", //StarSlash => "*/", // Punctuation: OpenParens => "(", CloseParens => ")", OpenBrace => "{", CloseBrace => "}", OpenSquareBracket => "[", CloseSquareBracket => "]", Semi => ";", Comma => ",", Elipsis3 => "...", Elipsis2 => "..", Colon => ":", Equal => "=", // Keywords: Void => "void", Bool => "bool", F32 => "f32", F64 => "f64", ISize => "isize", USize => "usize", U1 => "u1", U8 => "u8", U16 => "u16", U32 => "u32", U64 => "u64", I1 => "i1", I8 => "i8", I16 => "i16", I32 => "i32", I64 => "i64", Const => "const", Mutable => "mut", Volatile => "volatile", Noalias => "noalias", Fn => "fn", Let => "let", Var => "var", If => "if", As => "as", Else => "else", Return => "return", Struct => "struct", Type => "type", Union => "union", Enum => "enum", Packed => "packed", Extern => "extern", Pub => "pub", Module => "mod", // Operators Dot => ".", MinusGreater => "->", Bang => "!", Tilde => "~", Plus => "+", // PlusPlus => "++", Minus => "-", // MinusMinus => "--", Star => "*", Slash => "/", Percent => "%", Less => "<", Greater => ">", LessEqual => "<=", GreaterEqual => ">=", EqualEqual => "==", BangEqual => "!=", PipePipe => "||", AmpersandAmpersand => "&&", Ampersand => "&", Caret => "^", Pipe => "|", LessLess => "<<", GreaterGreater => ">>", Question => "?", PlusEqual => "+=", MinusEqual => "-=", StarEqual => "*=", SlashEqual => "/=", PercentEqual => "%=", AmpersandEqual => "&=", PipeEqual => "|=", CaretEqual => "^=", LessLessEqual => "<<=", GreaterGreaterEqual => ">>=" }); impl Token { pub fn is_assignment_op(self) -> bool { match self { Token::PlusEqual | Token::MinusEqual | Token::StarEqual | Token::SlashEqual | Token::PercentEqual | Token::PipeEqual | Token::CaretEqual | Token::AmpersandEqual | Token::LessLessEqual | Token::GreaterGreaterEqual | Token::Equal => true, _ => false, } } pub fn is_unary_op(self) -> bool { match self { Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang => true, _ => false, } } pub fn is_binary_op(self) -> bool { match self { Token::Star | Token::Slash | Token::Percent | Token::Pipe | Token::Ampersand | Token::Caret | Token::Plus | Token::Minus | Token::PipePipe | Token::AmpersandAmpersand | Token::BangEqual | Token::EqualEqual | Token::Less | Token::Greater | Token::LessEqual | Token::GreaterEqual | Token::LessLess | Token::GreaterGreater => true, _ => false, } } } use std::{ collections::VecDeque, marker::PhantomData, ops::{Deref, DerefMut, Range}, }; use trie::Tree; #[derive(Debug, Clone)] pub struct TokenItem<'a> { pub token: Token, pub lexeme: &'a str, pub offset: u32, } #[derive(Debug, Clone, Copy)] struct CharCountingIterator { iter: I, count: usize, } impl From for CharCountingIterator { fn from(iter: I) -> Self { Self { iter, count: 0 } } } impl> Iterator for CharCountingIterator { type Item = I::Item; fn next(&mut self) -> Option { self.iter.next().inspect(|c| self.count += c.len_utf8()) } } impl CharCountingIterator { pub(crate) fn offset(&self) -> usize { self.count } } impl core::ops::Deref for CharCountingIterator { type Target = I; fn deref(&self) -> &Self::Target { &self.iter } } impl core::ops::DerefMut for CharCountingIterator { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.iter } } type Source<'a> = CharCountingIterator>>; pub struct TokenIterator<'a> { trie: Tree, source: &'a str, offset: usize, } impl<'a> TokenIterator<'a> { pub fn new(source: &'a str) -> Self { let mut trie = Tree::new(); for (token, token_str) in Token::lexemes() { trie.insert(token_str.chars(), *token); } Self { trie, source, offset: 0, } } fn peekable_source(&self) -> Source<'a> { CharCountingIterator::from(self.source[self.offset..].chars().peekable()) } fn parse(&mut self) -> Option { let mut iter = CharCountingIterator::from(self.source[self.offset..].chars()); match self.trie.get_closest(&mut iter) { Some(token) => { // skip the peeked item self.offset += token.lexeme_len(); Some(*token) } None => None, } } fn skip_whitespaces(&mut self) -> usize { self.skip_while(is_things::is_whitespace) } fn skip(&mut self, mut n: usize) -> usize { self.skip_while(|_| { n -= 1; n > 0 }) } fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) -> usize { let mut count = 0; loop { let Some(c) = self.source[self.offset..].chars().next() else { break; }; if pred(c) { self.offset += c.len_utf8(); count += c.len_utf8(); continue; } else { break; } } count } fn next_token(&mut self) -> Option<(Token, Range)> { // skip whitespace self.skip_whitespaces(); let start = self.offset; let mut source = self.peekable_source(); let mut cursor = self.peekable_source(); let token = match cursor.next() { Some('0'..='9') => { let token = complex_tokens::parse_constant(&mut source).ok()?; self.offset += source.offset(); Some(token) } Some('.') if cursor.next().map_or(false, is_things::is_digit) => { let token = complex_tokens::parse_constant(&mut source).ok()?; self.offset += source.offset(); Some(token) } Some('\'' | '"') => { let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?; self.offset += source.offset(); Some(token) } Some('`') => { // raw identifier self.skip(1); self.skip_while(|c| is_things::is_id_continue(c)); if self.peekable_source().next() == Some('`') { self.skip(1); Some(Token::Ident) } else { // unterminated raw identifier Some(Token::ParseError) } } // `//`-style comments or doc-comments _ => match self.parse().map(|tok| match tok { Token::SlashSlash => { self.skip_while(|c| c == '\n'); Token::Comment } Token::SlashSlashSlash => { self.skip_while(|c| c == '\n'); Token::DocComment } _ => tok, }) { Some(tok) => { if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 { Some(Token::Ident) } else { Some(tok) } } None => { if self .peekable_source() .next() .map_or(false, |c| is_things::is_id_start(c)) { self.skip(1); self.skip_while(|c| is_things::is_id_continue(c)); Some(Token::Ident) } else { None } } }, }?; Some((token, start..self.offset)) } fn next_token_item(&mut self) -> Option> { let (token, range) = self.next_token()?; let lexeme = &self.source[range.clone()]; Some(TokenItem { token, lexeme, offset: range.start as u32, }) } pub fn into_token_items(self) -> TokenItemIterator<'a> { TokenItemIterator { inner: self } } } impl<'a> Iterator for TokenIterator<'a> { type Item = Token; fn next(&mut self) -> Option { self.next_token().map(|(token, _)| token) } } pub struct TokenItemIterator<'a> { inner: TokenIterator<'a>, } impl<'a> Iterator for TokenItemIterator<'a> { type Item = TokenItem<'a>; fn next(&mut self) -> Option { self.inner.next_token_item() } } pub struct Peeking; pub struct Consuming; pub trait ReborrowMode: sealed::Sealed {} impl ReborrowMode for Peeking {} impl ReborrowMode for Consuming {} mod sealed { pub trait Sealed {} impl Sealed for super::Peeking {} impl Sealed for super::Consuming {} } enum Queue<'a, T> { Owned(VecDeque), Borrowed(&'a mut VecDeque), } impl<'a, T> Queue<'a, T> { fn borrowed(&'_ mut self) -> Queue<'_, T> { match self { Queue::Owned(v) => Queue::Borrowed(v), Queue::Borrowed(v) => Queue::Borrowed(v), } } } impl Deref for Queue<'_, T> { type Target = VecDeque; fn deref(&self) -> &Self::Target { match self { Queue::Owned(v) => v, Queue::Borrowed(v) => v, } } } impl DerefMut for Queue<'_, T> { fn deref_mut(&mut self) -> &mut Self::Target { match self { Queue::Owned(v) => v, Queue::Borrowed(v) => v, } } } pub struct ReborrowingIterator<'a, 'b, I, T, Marker> where I: Iterator, { iter: &'a mut I, cache: Queue<'b, T>, peeking_cursor: usize, _marker: PhantomData, } pub type ReborrowingPeekingIterator<'a, 'b, I, T> = ReborrowingIterator<'a, 'b, I, T, Peeking>; pub type ReborrowingConsumingIterator<'a, 'b, I, T> = ReborrowingIterator<'a, 'b, I, T, Consuming>; impl<'a, 'b, I, T, Marker> ReborrowingIterator<'a, 'b, I, T, Marker> where I: Iterator, { pub fn new(iter: &'a mut I) -> Self { Self { iter, cache: Queue::Owned(VecDeque::new()), peeking_cursor: 0, _marker: PhantomData, } } pub fn reborrow_peeking(self) -> ReborrowingIterator<'a, 'b, I, T, Peeking> { ReborrowingIterator { iter: self.iter, cache: self.cache, peeking_cursor: 0, _marker: PhantomData, } } pub fn reborrow_consuming(self) -> ReborrowingIterator<'a, 'b, I, T, Consuming> { ReborrowingIterator { iter: self.iter, cache: self.cache, peeking_cursor: 0, _marker: PhantomData, } } pub fn borrow_peeking(&'_ mut self) -> ReborrowingIterator<'_, '_, I, T, Peeking> { ReborrowingIterator { iter: self.iter, cache: self.cache.borrowed(), peeking_cursor: 0, _marker: PhantomData, } } pub fn borrow_consuming(&'_ mut self) -> ReborrowingIterator<'_, '_, I, T, Consuming> { ReborrowingIterator { iter: self.iter, cache: self.cache.borrowed(), peeking_cursor: 0, _marker: PhantomData, } } } impl<'a, 'b, I, T> ReborrowingIterator<'a, 'b, I, T, Consuming> where I: Iterator, { pub fn expect_one_of>(&mut self, candidates: Ts) -> Option where T: Eq, { let mut candidates = candidates.into_iter(); let token = self.next()?; if candidates.any(|cand| cand == token) { Some(token) } else { None } } } impl<'a, 'b, I, T> Iterator for ReborrowingIterator<'a, 'b, I, T, Consuming> where I: Iterator, { type Item = T; fn next(&mut self) -> Option { self.cache.pop_front().or_else(|| self.iter.next()) } } impl<'a, 'b, I, T> Iterator for ReborrowingIterator<'a, 'b, I, T, Peeking> where I: Iterator, T: Copy, { type Item = T; fn next(&mut self) -> Option { self.peek_next().copied() } } impl<'a, 'b, I, T> ReborrowingIterator<'a, 'b, I, T, Peeking> where I: Iterator, { pub fn peek_next(&mut self) -> Option<&T> { if self.peeking_cursor >= self.cache.len() { if let Some(item) = self.iter.next() { self.peeking_cursor += 1; Some(self.cache.push_back_mut(item)) } else { None } } else { let item = self.cache.get(self.peeking_cursor)?; self.peeking_cursor += 1; Some(item) } } pub fn drain_peeked(&mut self) -> impl Iterator + '_ { let drained = self.cache.drain(0..self.peeking_cursor); self.peeking_cursor = 0; drained } pub fn skip(&mut self, n: usize) { let cached = self.cache.len() - self.peeking_cursor; self.peeking_cursor = self.peeking_cursor.saturating_add(n); if n > cached { // need to pull from the underlying iterator let surplus = n - cached; self.cache.extend(self.iter.take(surplus)); self.peeking_cursor += n; } } pub fn borrow_consuming_at_cursor( &'_ mut self, ) -> ReborrowingIterator<'_, '_, I, T, Consuming> { _ = self.drain_peeked(); ReborrowingIterator { iter: self.iter, cache: self.cache.borrowed(), peeking_cursor: 0, _marker: PhantomData, } } pub fn reborrow_consuming_at_cursor(mut self) -> ReborrowingIterator<'a, 'b, I, T, Consuming> { _ = self.drain_peeked(); ReborrowingIterator { iter: self.iter, cache: self.cache, peeking_cursor: 0, _marker: PhantomData, } } pub fn peek_one_of>(&mut self, candidates: Ts) -> Option<&T> where T: Eq, { let mut candidates = candidates.into_iter(); let token = self.peek_next()?; if candidates.any(|cand| &cand == token) { Some(token) } else { None } } } mod complex_tokens; #[cfg(test)] mod tests { use super::*; #[test] fn test_iterator() { let tokens = "fn let void+(+bool)"; let mut lexer = TokenIterator::new(&tokens); assert_eq!(lexer.next(), Some(Token::Fn)); assert_eq!(lexer.next(), Some(Token::Let)); assert_eq!(lexer.next(), Some(Token::Void)); assert_eq!(lexer.next(), Some(Token::Plus)); assert_eq!(lexer.next(), Some(Token::OpenParens)); assert_eq!(lexer.next(), Some(Token::Plus)); assert_eq!(lexer.next(), Some(Token::Bool)); assert_eq!(lexer.next(), Some(Token::CloseParens)); assert_eq!(lexer.next(), None); } #[test] fn idents() { let mut lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123"); assert!(lexer.all(|tok| tok == Token::Ident)); } #[test] fn ident_minus_ambiguity() { let lexer = TokenIterator::new("a-a a- - a -a --a"); let tokens = lexer.collect::>(); assert_eq!( tokens, vec![ Token::Ident, Token::Ident, Token::Minus, Token::Ident, Token::Ident, Token::Ident ] ); } #[test] fn complex_iterator() { let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }"; let lexer = TokenIterator::new(&tokens); let mut items = lexer .into_token_items() .map(|item| (item.token, item.lexeme)); assert_eq!(items.next(), Some((Token::Fn, "fn"))); assert_eq!(items.next(), Some((Token::Ident, "my-function"))); assert_eq!(items.next(), Some((Token::OpenParens, "("))); assert_eq!(items.next(), Some((Token::Ident, "x"))); assert_eq!(items.next(), Some((Token::Colon, ":"))); assert_eq!(items.next(), Some((Token::I32, "i32"))); assert_eq!(items.next(), Some((Token::Comma, ","))); assert_eq!(items.next(), Some((Token::Ident, "y"))); assert_eq!(items.next(), Some((Token::Colon, ":"))); assert_eq!(items.next(), Some((Token::F32, "f32"))); assert_eq!(items.next(), Some((Token::CloseParens, ")"))); assert_eq!(items.next(), Some((Token::MinusGreater, "->"))); assert_eq!(items.next(), Some((Token::F32, "f32"))); assert_eq!(items.next(), Some((Token::OpenBrace, "{"))); assert_eq!(items.next(), Some((Token::Return, "return"))); assert_eq!(items.next(), Some((Token::Ident, "x"))); assert_eq!(items.next(), Some((Token::Plus, "+"))); assert_eq!(items.next(), Some((Token::Ident, "y"))); assert_eq!(items.next(), Some((Token::Semi, ";"))); assert_eq!(items.next(), Some((Token::CloseBrace, "}"))); assert_eq!(items.next(), None); } }