From 2790bc561f116bccb41a5686d487077114b18b8b Mon Sep 17 00:00:00 2001 From: janis Date: Sat, 27 Sep 2025 18:20:52 +0200 Subject: [PATCH] more refactoring --- crates/lexer/src/lib.rs | 155 +++++++++++++++++++++++++--------------- 1 file changed, 98 insertions(+), 57 deletions(-) diff --git a/crates/lexer/src/lib.rs b/crates/lexer/src/lib.rs index ffcf151..12df766 100644 --- a/crates/lexer/src/lib.rs +++ b/crates/lexer/src/lib.rs @@ -297,21 +297,29 @@ impl Token { } } +use std::ops::Range; + use trie::Tree; +pub struct TokenItem<'a> { + pub token: Token, + pub lexeme: &'a str, + pub offset: u32, +} + #[derive(Debug, Clone, Copy)] -struct CountingIterator { +struct CharCountingIterator { iter: I, count: usize, } -impl From for CountingIterator { +impl From for CharCountingIterator { fn from(iter: I) -> Self { Self { iter, count: 0 } } } -impl> Iterator for CountingIterator { +impl> Iterator for CharCountingIterator { type Item = I::Item; fn next(&mut self) -> Option { @@ -319,13 +327,13 @@ impl> Iterator for CountingIterator { } } -impl CountingIterator { +impl CharCountingIterator { pub(crate) fn offset(&self) -> usize { self.count } } -impl core::ops::Deref for CountingIterator { +impl core::ops::Deref for CharCountingIterator { type Target = I; fn deref(&self) -> &Self::Target { @@ -333,13 +341,13 @@ impl core::ops::Deref for CountingIterator { } } -impl core::ops::DerefMut for CountingIterator { +impl core::ops::DerefMut for CharCountingIterator { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.iter } } -type Source<'a> = CountingIterator>>; +type Source<'a> = CharCountingIterator>>; pub struct TokenIterator<'a> { trie: Tree, @@ -363,11 +371,11 @@ impl<'a> TokenIterator<'a> { } fn peekable_source(&self) -> Source<'a> { - CountingIterator::from(self.source[self.offset..].chars().peekable()) + CharCountingIterator::from(self.source[self.offset..].chars().peekable()) } fn parse(&mut self) -> Option { - let mut iter = CountingIterator::from(self.source[self.offset..].chars()); + let mut iter = CharCountingIterator::from(self.source[self.offset..].chars()); match self.trie.get_closest(&mut iter) { Some(token) => { @@ -407,12 +415,8 @@ impl<'a> TokenIterator<'a> { } count } -} -impl<'a> Iterator for TokenIterator<'a> { - type Item = (Token, &'a str); - - fn next(&mut self) -> Option { + fn next_token(&mut self) -> Option<(Token, Range)> { // skip whitespace self.skip_whitespaces(); @@ -425,19 +429,19 @@ impl<'a> Iterator for TokenIterator<'a> { let token = complex_tokens::parse_constant(&mut source).ok()?; self.offset += source.offset(); - Some((token, &self.source[start..self.offset])) + Some(token) } Some('.') if cursor.next().map_or(false, is_things::is_digit) => { let token = complex_tokens::parse_constant(&mut source).ok()?; self.offset += source.offset(); - Some((token, &self.source[start..self.offset])) + Some(token) } Some('\'' | '"') => { let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?; self.offset += source.offset(); - Some((token, &self.source[start..self.offset])) + Some(token) } Some('`') => { // raw identifier @@ -445,10 +449,10 @@ impl<'a> Iterator for TokenIterator<'a> { self.skip_while(|c| is_things::is_id_continue(c)); if self.peekable_source().next() == Some('`') { self.skip(1); - Some((Token::Ident, &self.source[start..self.offset])) + Some(Token::Ident) } else { // unterminated raw identifier - Some((Token::ParseError, &self.source[start..self.offset])) + Some(Token::ParseError) } } // `//`-style comments or doc-comments @@ -465,9 +469,9 @@ impl<'a> Iterator for TokenIterator<'a> { }) { Some(tok) => { if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 { - Some((Token::Ident, &self.source[start..self.offset])) + Some(Token::Ident) } else { - Some((tok, &self.source[start..self.offset])) + Some(tok) } } None => { @@ -478,15 +482,49 @@ impl<'a> Iterator for TokenIterator<'a> { { self.skip(1); self.skip_while(|c| is_things::is_id_continue(c)); - Some((Token::Ident, &self.source[start..self.offset])) + Some(Token::Ident) } else { None } } }, - }; + }?; - token + Some((token, start..self.offset)) + } + + fn next_token_item(&mut self) -> Option> { + let (token, range) = self.next_token()?; + let lexeme = &self.source[range.clone()]; + Some(TokenItem { + token, + lexeme, + offset: range.start as u32, + }) + } + + pub fn into_token_items(self) -> TokenItemIterator<'a> { + TokenItemIterator { inner: self } + } +} + +impl<'a> Iterator for TokenIterator<'a> { + type Item = Token; + + fn next(&mut self) -> Option { + self.next_token().map(|(token, _)| token) + } +} + +pub struct TokenItemIterator<'a> { + inner: TokenIterator<'a>, +} + +impl<'a> Iterator for TokenItemIterator<'a> { + type Item = TokenItem<'a>; + + fn next(&mut self) -> Option { + self.inner.next_token_item() } } @@ -500,27 +538,27 @@ mod tests { fn test_iterator() { let tokens = "fn let void+(+bool)"; let mut lexer = TokenIterator::new(&tokens); - assert_eq!(lexer.next(), Some((Token::Fn, "fn"))); - assert_eq!(lexer.next(), Some((Token::Let, "let"))); - assert_eq!(lexer.next(), Some((Token::Void, "void"))); - assert_eq!(lexer.next(), Some((Token::Plus, "+"))); - assert_eq!(lexer.next(), Some((Token::OpenParens, "("))); - assert_eq!(lexer.next(), Some((Token::Plus, "+"))); - assert_eq!(lexer.next(), Some((Token::Bool, "bool"))); - assert_eq!(lexer.next(), Some((Token::CloseParens, ")"))); + assert_eq!(lexer.next(), Some(Token::Fn)); + assert_eq!(lexer.next(), Some(Token::Let)); + assert_eq!(lexer.next(), Some(Token::Void)); + assert_eq!(lexer.next(), Some(Token::Plus)); + assert_eq!(lexer.next(), Some(Token::OpenParens)); + assert_eq!(lexer.next(), Some(Token::Plus)); + assert_eq!(lexer.next(), Some(Token::Bool)); + assert_eq!(lexer.next(), Some(Token::CloseParens)); assert_eq!(lexer.next(), None); } #[test] fn idents() { - let lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123"); - assert!(lexer.map(|(tok, _)| tok).all(|tok| tok == Token::Ident)); + let mut lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123"); + assert!(lexer.all(|tok| tok == Token::Ident)); } #[test] fn ident_minus_ambiguity() { let lexer = TokenIterator::new("a-a a- - a -a --a"); - let tokens = lexer.map(|(tok, _)| tok).collect::>(); + let tokens = lexer.collect::>(); assert_eq!( tokens, vec![ @@ -537,27 +575,30 @@ mod tests { #[test] fn complex_iterator() { let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }"; - let mut lexer = TokenIterator::new(&tokens); - assert_eq!(lexer.next(), Some((Token::Fn, "fn"))); - assert_eq!(lexer.next(), Some((Token::Ident, "my-function"))); - assert_eq!(lexer.next(), Some((Token::OpenParens, "("))); - assert_eq!(lexer.next(), Some((Token::Ident, "x"))); - assert_eq!(lexer.next(), Some((Token::Colon, ":"))); - assert_eq!(lexer.next(), Some((Token::I32, "i32"))); - assert_eq!(lexer.next(), Some((Token::Comma, ","))); - assert_eq!(lexer.next(), Some((Token::Ident, "y"))); - assert_eq!(lexer.next(), Some((Token::Colon, ":"))); - assert_eq!(lexer.next(), Some((Token::F32, "f32"))); - assert_eq!(lexer.next(), Some((Token::CloseParens, ")"))); - assert_eq!(lexer.next(), Some((Token::MinusGreater, "->"))); - assert_eq!(lexer.next(), Some((Token::F32, "f32"))); - assert_eq!(lexer.next(), Some((Token::OpenBrace, "{"))); - assert_eq!(lexer.next(), Some((Token::Return, "return"))); - assert_eq!(lexer.next(), Some((Token::Ident, "x"))); - assert_eq!(lexer.next(), Some((Token::Plus, "+"))); - assert_eq!(lexer.next(), Some((Token::Ident, "y"))); - assert_eq!(lexer.next(), Some((Token::Semi, ";"))); - assert_eq!(lexer.next(), Some((Token::CloseBrace, "}"))); - assert_eq!(lexer.next(), None); + let lexer = TokenIterator::new(&tokens); + let mut items = lexer + .into_token_items() + .map(|item| (item.token, item.lexeme)); + assert_eq!(items.next(), Some((Token::Fn, "fn"))); + assert_eq!(items.next(), Some((Token::Ident, "my-function"))); + assert_eq!(items.next(), Some((Token::OpenParens, "("))); + assert_eq!(items.next(), Some((Token::Ident, "x"))); + assert_eq!(items.next(), Some((Token::Colon, ":"))); + assert_eq!(items.next(), Some((Token::I32, "i32"))); + assert_eq!(items.next(), Some((Token::Comma, ","))); + assert_eq!(items.next(), Some((Token::Ident, "y"))); + assert_eq!(items.next(), Some((Token::Colon, ":"))); + assert_eq!(items.next(), Some((Token::F32, "f32"))); + assert_eq!(items.next(), Some((Token::CloseParens, ")"))); + assert_eq!(items.next(), Some((Token::MinusGreater, "->"))); + assert_eq!(items.next(), Some((Token::F32, "f32"))); + assert_eq!(items.next(), Some((Token::OpenBrace, "{"))); + assert_eq!(items.next(), Some((Token::Return, "return"))); + assert_eq!(items.next(), Some((Token::Ident, "x"))); + assert_eq!(items.next(), Some((Token::Plus, "+"))); + assert_eq!(items.next(), Some((Token::Ident, "y"))); + assert_eq!(items.next(), Some((Token::Semi, ";"))); + assert_eq!(items.next(), Some((Token::CloseBrace, "}"))); + assert_eq!(items.next(), None); } }