From 82d2eed09a919fa456e38a7a58cb6869fe94c3f0 Mon Sep 17 00:00:00 2001 From: janis Date: Fri, 19 Sep 2025 20:12:13 +0200 Subject: [PATCH] lexer crate --- Cargo.toml | 2 + crates/lexer/Cargo.toml | 2 + crates/lexer/src/complex_tokens.rs | 394 +++++++++++++++++++++++++++++ crates/lexer/src/lib.rs | 236 ++++++++++++++--- 4 files changed, 597 insertions(+), 37 deletions(-) create mode 100644 crates/lexer/src/complex_tokens.rs diff --git a/Cargo.toml b/Cargo.toml index 083abff..f96d813 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,8 @@ werkzeug = { path = "../../rust/werkzeug" } [workspace.dependencies] unicode-xid = "0.2.4" tracing = "0.1.41" +thiserror = "1.0.63" +itertools = "0.13.0" werkzeug = { path = "../../rust/werkzeug" } trie = { path = "../../rust/trie" } \ No newline at end of file diff --git a/crates/lexer/Cargo.toml b/crates/lexer/Cargo.toml index 5db2779..8084c67 100644 --- a/crates/lexer/Cargo.toml +++ b/crates/lexer/Cargo.toml @@ -6,5 +6,7 @@ edition = "2024" [dependencies] tracing = { workspace = true } werkzeug = { workspace = true } +thiserror = { workspace = true } +itertools = { workspace = true } trie = { workspace = true } unicode-xid = { workspace = true } \ No newline at end of file diff --git a/crates/lexer/src/complex_tokens.rs b/crates/lexer/src/complex_tokens.rs new file mode 100644 index 0000000..230539c --- /dev/null +++ b/crates/lexer/src/complex_tokens.rs @@ -0,0 +1,394 @@ +use crate::{Source, Token, is_things}; +use itertools::Itertools; +use werkzeug::iter::{FallibleMapIter, NextIf}; + +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +pub enum Error { + #[error("{0}")] + StringError(String), + #[error("Exp part of floating constant had no digits.")] + FloatingConstantExpPartNoDigit, + #[error("constant cannot start with leading underscore '_'.")] + NumericalConstantDigitLeadingUnderscore, + #[error("Expected digit here for constant.")] + NumericalConstantDigitNoDigit, + #[error("Expected digit here for integer constant.")] + IntegralTypeExpectedDigit, + #[error("Floating constant has invalid trailing type.")] + FloatingConstantInvalidTrailingType, + #[error("Invalid token.")] + InvalidToken, + #[error("Identifier starts with invalid character.")] + ExpectedIdStartForIdentifier, + #[error("Unknown suffix in constant.")] + NumericalConstantUnknownSuffix, +} + +type Result = core::result::Result; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Radix { + Hex, + Bin, + Dec, + Oct, +} + +impl Radix { + #[allow(unused)] + /// must be called with one of `['b','x','d','o']` + unsafe fn from_char_unchecked(c: char) -> Self { + match c.to_ascii_lowercase() { + 'o' => Self::Oct, + 'b' => Self::Bin, + 'x' => Self::Hex, + 'd' => Self::Dec, + _ => unreachable!(), + } + } + fn from_char(c: char) -> Option { + match c.to_ascii_lowercase() { + 'o' => Some(Self::Oct), + 'b' => Some(Self::Bin), + 'x' => Some(Self::Hex), + 'd' => Some(Self::Dec), + _ => None, + } + } + + #[allow(unused)] + pub fn radix(self) -> u8 { + match self { + Radix::Hex => 16, + Radix::Bin => 2, + Radix::Oct => 8, + Radix::Dec => 10, + } + } + fn to_token(self) -> Token { + match self { + Radix::Hex => Token::IntegerHexConstant, + Radix::Bin => Token::IntegerBinConstant, + Radix::Oct => Token::IntegerOctConstant, + Radix::Dec => Token::IntegerConstant, + } + } + pub fn from_token(token: Token) -> Option { + match token { + Token::IntegerHexConstant => Some(Radix::Hex), + Token::IntegerBinConstant => Some(Radix::Bin), + Token::IntegerOctConstant => Some(Radix::Oct), + Token::IntegerConstant => Some(Radix::Dec), + _ => None, + } + } + pub fn map_digit(self, c: char) -> u8 { + match self { + Radix::Hex => match c { + '0'..='9' => c as u8 - b'0', + 'a'..='f' => 10 + c as u8 - b'a', + 'A'..='F' => 10 + c as u8 - b'A', + _ => unreachable!(), + }, + Radix::Bin => match c { + '0'..='1' => c as u8 - b'0', + _ => unreachable!(), + }, + Radix::Dec => match c { + '0'..='9' => c as u8 - b'0', + _ => unreachable!(), + }, + Radix::Oct => match c { + '0'..='7' => c as u8 - b'0', + _ => unreachable!(), + }, + } + } + pub fn folding_method(self) -> fn(u64, char) -> u64 { + match self { + Radix::Hex => { + fn fold(acc: u64, c: char) -> u64 { + let digit = match c { + '0'..='9' => c as u8 - b'0', + 'a'..='f' => c as u8 - b'a', + 'A'..='F' => c as u8 - b'A', + _ => unreachable!(), + }; + acc * 16 + digit as u64 + } + fold + } + Radix::Bin => { + fn fold(acc: u64, c: char) -> u64 { + let digit = match c { + '0'..='1' => c as u8 - b'0', + _ => unreachable!(), + }; + acc * 2 + digit as u64 + } + fold + } + Radix::Dec => { + fn fold(acc: u64, c: char) -> u64 { + let digit = match c { + '0'..='9' => c as u8 - b'0', + _ => unreachable!(), + }; + acc * 10 + digit as u64 + } + fold + } + Radix::Oct => { + fn fold(acc: u64, c: char) -> u64 { + let digit = match c { + '0'..='7' => c as u8 - b'0', + _ => unreachable!(), + }; + acc * 8 + digit as u64 + } + fold + } + } + } + pub fn is_digit(self) -> fn(char) -> bool { + match self { + Radix::Hex => crate::is_things::is_hex_digit, + Radix::Bin => crate::is_things::is_bin_digit, + Radix::Oct => crate::is_things::is_oct_digit, + Radix::Dec => crate::is_things::is_digit, + } + } +} + +// where DIGIT is defined by radix: +// DIGITS <- +// if allow_leading_underscore: `_`* DIGIT (DIGIT|`_`)* +// else: DIGIT (DIGIT|`_`)* +fn parse_digit_part( + source: &mut Source, + allow_leading_underscore: bool, + radix: Radix, +) -> Result<()> { + let is_digit = radix.is_digit(); + + if allow_leading_underscore { + let _underscore = source.take_while_ref(|&c| c == '_').count(); + } + let _need_digit = source.next_if(|&c| is_digit(c)).ok_or_else(|| { + if source.peek() == Some(&'_') { + Error::NumericalConstantDigitLeadingUnderscore + } else { + Error::NumericalConstantDigitNoDigit + } + })?; + let _rest = source.take_while_ref(|&c| is_digit(c) || c == '_').count(); + + Ok(()) +} + +// IntegralType <- +// ( 'u' | 'i' ) DIGITS+ +fn try_parse_integral_type(source: &mut Source) -> Result> { + if !source.next_if(|&c| c == 'u' || c == 'i').is_some() { + return Ok(None); + } + + if source + .take_while_ref(|&c| crate::is_things::is_digit(c)) + .count() + <= 0 + { + return Err(Error::IntegralTypeExpectedDigit); + }; + + Ok(Some(())) +} + +// returns `Err(E)` if it failed to parse. +// returns `Ok(None)` if no exp part was found. +// returns `Ok(Some(()))` if an exp part was found and parsed. +// +// EXP_PART <- +// (`e`|`E`) (`-`|`+`)? DEC_DIGITS +fn try_parse_exp_part(source: &mut Source) -> Result> { + if source.next_if(|&c| c.to_ascii_lowercase() == 'e').is_some() { + let _sign = source.next_if(|&c| c == '-' || c == '+'); + if source + .take_while_ref(|&c| crate::is_things::is_digit(c)) + .count() + .lt(&1) + { + // need digits following exp notation + Err(Error::FloatingConstantExpPartNoDigit) + } else { + Ok(Some(())) + } + } else { + Ok(None) + } +} + +// CONSTANT <- +// DEC_DIGITS IntegralType? +// `0x` HEX_DIGITS IntegralType? +// `0b` BIN_DIGITS IntegralType? +// `0o` OCT_DIGITS IntegralType? +// DEC_DIGITS FloatingType? +// `.` DEC_DIGITS EXP_PART? FloatingType? +// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType? +fn parse_constant_inner(source: &mut Source) -> Result { + let zero = source.next_if(|&c| c == '0').is_some(); + + let radix = zero + .then(|| source.next_if_map(|c| Radix::from_char(c))) + .flatten(); + + if let Some(radix) = radix { + parse_digit_part(source, false, radix)?; + if source.peek().map(|&c| c == 'u' || c == 'i') == Some(true) { + try_parse_integral_type(source)?; + } + return Ok(radix.to_token()); + } + + // if zero: `_`* DIGIT (DIGIT|`_`)* + // else: DIGIT (DIGIT|`_`)* + _ = match parse_digit_part(source, zero, Radix::Dec) { + Ok(_) => Ok(()), + Err(Error::NumericalConstantDigitNoDigit) if zero => Ok(()), + Err(e) => Err(e), + }?; + + if let Some(_) = source.try_map_iter_if(|source| try_parse_integral_type(source))? { + return Ok(Token::IntegerConstant); + } + + let dot = source.next_if(|&c| c == '.').is_some(); + + if dot { + parse_digit_part(source, false, Radix::Dec)?; + } + + // parse exp notation + let exp = try_parse_exp_part(source)?.is_some(); + + // trailing FloatingType? + let floating = if source.next_if(|&c| c == 'f').is_some() { + let digits = source.next_tuple::<(char, char)>(); + if !(digits == Some(('6', '4')) || digits == Some(('3', '2'))) { + // need either f64 or f32 here! + return Err(Error::FloatingConstantInvalidTrailingType); + } + true + } else { + false + }; + + let token = match (dot, exp, floating) { + (false, false, false) => Token::IntegerConstant, + (true, false, _) => Token::DotFloatingConstant, + (true, true, _) => Token::DotFloatingExpConstant, + (false, true, _) => Token::FloatingExpConstant, + (false, _, _) => Token::FloatingConstant, + }; + + Ok(token) +} + +pub(crate) fn parse_constant(source: &mut Source) -> Result { + let constant = parse_constant_inner(source)?; + // char following a constant must not be id_continue + if source + .peek() + .map(|&c| crate::is_things::is_id_continue(c)) + .unwrap_or(false) + { + return Err(Error::NumericalConstantUnknownSuffix); + } + + Ok(constant) +} + +pub(crate) fn parse_string_or_char_constant(source: &mut Source) -> Result { + let quote = source + .next_if(|&c| c == '"' || c == '\'') + .ok_or(Error::InvalidToken)?; + + let is_char = quote == '\''; + + let mut escaped = false; + let mut closed = false; + + while let Some(c) = source.next() { + if escaped { + // accept any escaped char + escaped = false; + continue; + } + if c == '\\' { + escaped = true; + continue; + } + if c == quote { + closed = true; + break; + } + } + + if !closed { + return Err(Error::StringError("Unterminated string/char.".into())); + } + + if is_char { + Ok(Token::CharConstant) + } else { + Ok(Token::StringConstant) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + fn make_source(s: &'_ str) -> Source<'_> { + s.chars().peekable().into() + } + + #[test] + fn parse_constant_number() { + assert_eq!( + parse_constant(&mut make_source("0x1A3F_u32")), + Ok(Token::IntegerHexConstant) + ); + assert_eq!( + parse_constant(&mut make_source("13f32")), + Ok(Token::FloatingConstant) + ); + + assert_eq!( + parse_constant(&mut make_source("0b1011_0010i16")), + Ok(Token::IntegerBinConstant) + ); + assert_eq!( + parse_constant(&mut make_source("0o755u8")), + Ok(Token::IntegerOctConstant) + ); + assert_eq!( + parse_constant(&mut make_source("42i64")), + Ok(Token::IntegerConstant) + ); + assert_eq!( + parse_constant(&mut make_source("3.14f64")), + Ok(Token::DotFloatingConstant) + ); + assert_eq!( + parse_constant(&mut make_source("2.71828e0f32")), + Ok(Token::DotFloatingExpConstant) + ); + assert_eq!( + parse_constant(&mut make_source("22e23")), + Ok(Token::FloatingExpConstant) + ); + } +} diff --git a/crates/lexer/src/lib.rs b/crates/lexer/src/lib.rs index 4ca2263..1f632dd 100644 --- a/crates/lexer/src/lib.rs +++ b/crates/lexer/src/lib.rs @@ -38,14 +38,14 @@ mod is_things { /// a formal definition of valid identifier name. pub fn is_id_start(c: char) -> bool { // This is XID_Start OR '_' (which formally is not a XID_Start). - c == '_' || unicode_xid::UnicodeXID::is_xid_start(c) + c == '_' || c == '-' || unicode_xid::UnicodeXID::is_xid_start(c) } /// True if `c` is valid as a non-first character of an identifier. /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for /// a formal definition of valid identifier name. pub fn is_id_continue(c: char) -> bool { - unicode_xid::UnicodeXID::is_xid_continue(c) + unicode_xid::UnicodeXID::is_xid_continue(c) || c == '-' } /// The passed string is lexically an identifier. @@ -58,22 +58,27 @@ mod is_things { } } + #[expect(dead_code)] pub fn is_digit(ch: char) -> bool { ('0'..='9').contains(&ch) } + #[expect(dead_code)] pub fn is_bin_digit(ch: char) -> bool { ch == '0' || ch == '1' } + #[expect(dead_code)] pub fn is_nonzero_digit(ch: char) -> bool { ('1'..='9').contains(&ch) } + #[expect(dead_code)] pub fn is_oct_digit(ch: char) -> bool { ('0'..='7').contains(&ch) } + #[expect(dead_code)] pub fn is_hex_digit(ch: char) -> bool { ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch) } @@ -119,6 +124,11 @@ macro_rules! tokens { self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0) } + /// returns the number of chars in this lexeme + $vis fn lexeme_len_utf8(&self) -> usize { + self.lexeme().map(|lexeme|lexeme.len()).unwrap_or(0) + } + $vis fn maybe_ident(&self) -> bool { self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false) } @@ -149,16 +159,15 @@ tokens!(pub Token: { DotFloatingConstant, DotFloatingExpConstant, StringConstant, - IntegralType, Ident }, // Lexical Tokens: { SlashSlash => "//", - SlashStar => "/*", - // SlashStarStar => "/**", - StarSlash => "*/", - // SlashSlashSlash => "///", + SlashSlashSlash => "///", + // SlashStar => "/*", + // SlashStarStar => "/**", + //StarSlash => "*/", // Punctuation: OpenParens => "(", CloseParens => ")", @@ -445,15 +454,59 @@ impl LexemeParser { } } -use trie::{OnceAndIter, Tree}; +use itertools::Itertools; +use trie::Tree; -pub struct LexemeIterator> { - trie: Tree, - iter: OnceAndIter, +#[derive(Debug, Clone, Copy)] +struct CountingIterator { + iter: I, + count: usize, } -impl> LexemeIterator { - pub fn new(iter: I) -> Self { +impl From for CountingIterator { + fn from(iter: I) -> Self { + Self { iter, count: 0 } + } +} + +impl> Iterator for CountingIterator { + type Item = I::Item; + + fn next(&mut self) -> Option { + self.iter.next().inspect(|c| self.count += c.len_utf8()) + } +} + +impl CountingIterator { + pub(crate) fn offset(&self) -> usize { + self.count + } +} + +impl core::ops::Deref for CountingIterator { + type Target = I; + + fn deref(&self) -> &Self::Target { + &self.iter + } +} + +impl core::ops::DerefMut for CountingIterator { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.iter + } +} + +type Source<'a> = CountingIterator>>; + +pub struct TokenIterator<'a> { + trie: Tree, + source: &'a str, + offset: usize, +} + +impl<'a> TokenIterator<'a> { + pub fn new(source: &'a str) -> Self { let mut trie = Tree::new(); for (token, token_str) in Token::lexemes() { @@ -462,63 +515,172 @@ impl> LexemeIterator { Self { trie, - iter: iter.into(), + source, + offset: 0, } } + fn peekable_source(&self) -> Source<'a> { + CountingIterator::from(self.source[self.offset..].chars().peekable()) + } + fn parse(&mut self) -> Option { - match self.trie.get_closest(&mut self.iter) { - Some((Some(key), token)) => { + let mut iter = CountingIterator::from(self.source[self.offset..].chars()); + + match self.trie.get_closest(&mut iter) { + Some(token) => { // skip the peeked item - self.iter.set_once(key); + self.offset += token.lexeme_len(); Some(*token) } - Some((None, token)) => Some(*token), None => None, } } - fn skip_whitespaces(&mut self) { - loop { - let Some(c) = self.iter.next() else { break }; + fn skip_whitespaces(&mut self) -> usize { + self.skip_while(is_things::is_whitespace) + } - if is_things::is_whitespace(c) { + fn skip(&mut self, mut n: usize) -> usize { + self.skip_while(|_| { + n -= 1; + n > 0 + }) + } + + fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) -> usize { + let mut count = 0; + loop { + let Some(c) = self.source[self.offset..].chars().next() else { + break; + }; + + if pred(c) { + self.offset += c.len_utf8(); + count += c.len_utf8(); continue; } else { - self.iter.set_once(c); break; } } + count } } -impl> Iterator for LexemeIterator { - type Item = Token; +impl<'a> Iterator for TokenIterator<'a> { + type Item = (Token, &'a str); fn next(&mut self) -> Option { // skip whitespace self.skip_whitespaces(); - self.parse() + let start = self.offset; + + let mut source = self.peekable_source(); + let mut cursor = self.peekable_source(); + let token = match cursor.next() { + Some('0'..='9') => { + let token = complex_tokens::parse_constant(&mut source).ok()?; + self.offset += source.offset(); + + Some((token, &self.source[start..self.offset])) + } + Some('.') if cursor.next().map_or(false, is_things::is_digit) => { + let token = complex_tokens::parse_constant(&mut source).ok()?; + self.offset += source.offset(); + + Some((token, &self.source[start..self.offset])) + } + Some('\'' | '"') => { + let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?; + self.offset += source.offset(); + + Some((token, &self.source[start..self.offset])) + } + _ => match self.parse().map(|tok| match tok { + Token::SlashSlash => { + self.skip_while(|c| c == '\n'); + (Token::Comment) + } + Token::SlashSlashSlash => { + self.skip_while(|c| c == '\n'); + (Token::DocComment) + } + _ => tok, + }) { + Some(tok) => { + if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 { + Some((Token::Ident, &self.source[start..self.offset])) + } else { + Some((tok, &self.source[start..self.offset])) + } + } + None => { + if self + .peekable_source() + .next() + .map_or(false, |c| is_things::is_id_start(c)) + { + self.skip(1); + self.skip_while(|c| is_things::is_id_continue(c)); + Some((Token::Ident, &self.source[start..self.offset])) + } else { + None + } + } + }, + }; + + token } } + +mod complex_tokens; + #[cfg(test)] mod tests { use super::*; #[test] fn test_iterator() { - let mut tokens = "fn let void+++(++bool)".chars(); - let mut lexer = LexemeIterator::new(&mut tokens); - assert_eq!(lexer.next(), Some(Token::Fn)); - assert_eq!(lexer.next(), Some(Token::Let)); - assert_eq!(lexer.next(), Some(Token::Void)); - assert_eq!(lexer.next(), Some(Token::PlusPlus)); - assert_eq!(lexer.next(), Some(Token::Plus)); - assert_eq!(lexer.next(), Some(Token::OpenParens)); - assert_eq!(lexer.next(), Some(Token::PlusPlus)); - assert_eq!(lexer.next(), Some(Token::Bool)); - assert_eq!(lexer.next(), Some(Token::CloseParens)); + let tokens = "fn let void+++(++bool)"; + let mut lexer = TokenIterator::new(&tokens); + assert_eq!(lexer.next(), Some((Token::Fn, "fn"))); + assert_eq!(lexer.next(), Some((Token::Let, "let"))); + assert_eq!(lexer.next(), Some((Token::Void, "void"))); + assert_eq!(lexer.next(), Some((Token::PlusPlus, "++"))); + assert_eq!(lexer.next(), Some((Token::Plus, "+"))); + assert_eq!(lexer.next(), Some((Token::OpenParens, "("))); + assert_eq!(lexer.next(), Some((Token::PlusPlus, "++"))); + assert_eq!(lexer.next(), Some((Token::Bool, "bool"))); + assert_eq!(lexer.next(), Some((Token::CloseParens, ")"))); + assert_eq!(lexer.next(), None); + } + + #[test] + fn complex_iterator() { + let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }"; + let mut lexer = TokenIterator::new(&tokens); + assert_eq!(lexer.next(), Some((Token::Fn, "fn"))); + assert_eq!(lexer.next(), Some((Token::Ident, "my-function"))); + assert_eq!(lexer.next(), Some((Token::OpenParens, "("))); + assert_eq!(lexer.next(), Some((Token::Ident, "x"))); + assert_eq!(lexer.next(), Some((Token::Colon, ":"))); + assert_eq!(lexer.next(), Some((Token::Ident, "i32"))); + assert_eq!(lexer.next(), Some((Token::Comma, ","))); + assert_eq!(lexer.next(), Some((Token::Ident, "y"))); + assert_eq!(lexer.next(), Some((Token::Colon, ":"))); + assert_eq!(lexer.next(), Some((Token::F32, "f32"))); + assert_eq!(lexer.next(), Some((Token::CloseParens, ")"))); + assert_eq!(lexer.next(), Some((Token::MinusGreater, "->"))); + assert_eq!(lexer.next(), Some((Token::F32, "f32"))); + assert_eq!(lexer.next(), Some((Token::OpenBrace, "{"))); + assert_eq!(lexer.next(), Some((Token::Return, "return"))); + assert_eq!(lexer.next(), Some((Token::Ident, "x"))); + assert_eq!(lexer.next(), Some((Token::Plus, "+"))); + assert_eq!(lexer.next(), Some((Token::Ident, "y"))); + assert_eq!(lexer.next(), Some((Token::Semi, ";"))); + assert_eq!(lexer.next(), Some((Token::CloseBrace, "}"))); assert_eq!(lexer.next(), None); } }