diff --git a/crates/lexer/src/complex_tokens.rs b/crates/lexer/src/complex_tokens.rs
index 230539c..802f4ae 100644
--- a/crates/lexer/src/complex_tokens.rs
+++ b/crates/lexer/src/complex_tokens.rs
@@ -18,8 +18,6 @@ pub enum Error {
     FloatingConstantInvalidTrailingType,
     #[error("Invalid token.")]
     InvalidToken,
-    #[error("Identifier starts with invalid character.")]
-    ExpectedIdStartForIdentifier,
     #[error("Unknown suffix in constant.")]
     NumericalConstantUnknownSuffix,
 }
@@ -73,6 +71,8 @@ impl Radix {
             Radix::Dec => Token::IntegerConstant,
         }
     }
+
+    #[expect(dead_code)]
     pub fn from_token(token: Token) -> Option<Radix> {
         match token {
             Token::IntegerHexConstant => Some(Radix::Hex),
@@ -82,6 +82,8 @@ impl Radix {
             _ => None,
         }
     }
+
+    #[expect(dead_code)]
     pub fn map_digit(self, c: char) -> u8 {
         match self {
             Radix::Hex => match c {
@@ -104,6 +106,8 @@ impl Radix {
             },
         }
     }
+
+    #[expect(dead_code)]
     pub fn folding_method(self) -> fn(u64, char) -> u64 {
         match self {
             Radix::Hex => {
@@ -152,10 +156,10 @@ impl Radix {
     }
     pub fn is_digit(self) -> fn(char) -> bool {
         match self {
-            Radix::Hex => crate::is_things::is_hex_digit,
-            Radix::Bin => crate::is_things::is_bin_digit,
-            Radix::Oct => crate::is_things::is_oct_digit,
-            Radix::Dec => crate::is_things::is_digit,
+            Radix::Hex => is_things::is_hex_digit,
+            Radix::Bin => is_things::is_bin_digit,
+            Radix::Oct => is_things::is_oct_digit,
+            Radix::Dec => is_things::is_digit,
         }
     }
 }
@@ -193,11 +197,7 @@ fn try_parse_integral_type(source: &mut Source) -> Result<Option<Token>> {
         return Ok(None);
     }
 
-    if source
-        .take_while_ref(|&c| crate::is_things::is_digit(c))
-        .count()
-        <= 0
-    {
+    if source.take_while_ref(|&c| is_things::is_digit(c)).count() <= 0 {
         return Err(Error::IntegralTypeExpectedDigit);
     };
 
@@ -214,7 +214,7 @@ fn try_parse_exp_part(source: &mut Source) -> Result<Option<Token>> {
     if source.next_if(|&c| c.to_ascii_lowercase() == 'e').is_some() {
         let _sign = source.next_if(|&c| c == '-' || c == '+');
         if source
-            .take_while_ref(|&c| crate::is_things::is_digit(c))
+            .take_while_ref(|&c| is_things::is_digit(c))
             .count()
             .lt(&1)
         {
@@ -300,7 +300,7 @@ pub(crate) fn parse_constant(source: &mut Source) -> Result<Token> {
     // char following a constant must not be id_continue
     if source
         .peek()
-        .map(|&c| crate::is_things::is_id_continue(c))
+        .map(|&c| is_things::is_id_continue(c))
         .unwrap_or(false)
     {
         return Err(Error::NumericalConstantUnknownSuffix);
diff --git a/crates/lexer/src/lib.rs b/crates/lexer/src/lib.rs
index a2af80f..5fd50f8 100644
--- a/crates/lexer/src/lib.rs
+++ b/crates/lexer/src/lib.rs
@@ -58,12 +58,10 @@ mod is_things {
         }
     }
 
-    #[expect(dead_code)]
     pub fn is_digit(ch: char) -> bool {
         ('0'..='9').contains(&ch)
     }
 
-    #[expect(dead_code)]
     pub fn is_bin_digit(ch: char) -> bool {
         ch == '0' || ch == '1'
     }
@@ -73,12 +71,10 @@ mod is_things {
         ('1'..='9').contains(&ch)
     }
 
-    #[expect(dead_code)]
     pub fn is_oct_digit(ch: char) -> bool {
         ('0'..='7').contains(&ch)
    }
 
-    #[expect(dead_code)]
     pub fn is_hex_digit(ch: char) -> bool {
         ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch)
     }
@@ -301,170 +297,6 @@ impl Token {
     }
 }
 
-/// A list of lexemes used by the `LexemeParser`.
-/// `lexemes` contains every token that has a defined lexeme, such as `fn`, `f32`, `const`, etc.
-/// The `LexemeList` keeps track of two offsets into the `lexemes` array,
-/// splitting it into three windows:
-/// - [0, start_candidates) - tokens that are still being considered for parsing
-/// - [start_candidates, end_candidates) - the tokens which this lexeme matches
-/// - [end_candidates, len) - tokens that have been filtered out and are no longer considered
-/// On each iteration of the parsing loop, the remaining tokens are matched
-/// against the next character and, if they match completely, are swapped into
-/// the candidates window, or swapped to the end if they don't.
-struct LexemeList {
-    lexemes: Box<[Token]>,
-    start_candidates: usize,
-    end_candidates: usize,
-    filtered: Vec<(usize, FilterResult)>,
-}
-
-enum FilterResult {
-    Remove,
-    Candidate,
-}
-
-impl LexemeList {
-    fn new() -> Self {
-        let lexemes = Token::lexemes()
-            .iter()
-            .map(|(tok, _)| tok.clone())
-            .collect::<Box<[Token]>>();
-
-        Self {
-            start_candidates: lexemes.len(),
-            end_candidates: lexemes.len(),
-            lexemes,
-            filtered: Vec::new(),
-        }
-    }
-
-    fn clear(&mut self) {
-        self.start_candidates = self.lexemes.len();
-        self.end_candidates = self.lexemes.len();
-    }
-
-    fn remaining(&self) -> &[Token] {
-        &self.lexemes[0..self.start_candidates]
-    }
-
-    fn candidates(&self) -> &[Token] {
-        &self.lexemes[self.start_candidates..self.end_candidates]
-    }
-
-    fn step(&mut self, ch: char, pos: usize) {
-        // smartly reuse allocation for `filtered`
-        // truly one of the premature optimizations.
-        // but it just feels good, innit?
-        let mut filtered = core::mem::take(&mut self.filtered);
-
-        self.remaining()
-            .iter()
-            .enumerate()
-            .filter_map(|(i, tok)| {
-                let bytes = tok.lexeme().unwrap().as_bytes();
-                // SAFETY: all tokens in `self.remaining()` are lexical tokens, and
-                // they are all valid ascii
-                let c = unsafe {
-                    // TODO: maybe keep a list of `Char<'_>`s around in order to
-                    // support fully utf8 tokens?
-                    char::from_u32_unchecked(bytes[pos] as u32)
-                };
-                match c == ch {
-                    false => Some((i, FilterResult::Remove)),
-                    true if bytes.len() <= pos + 1 => Some((i, FilterResult::Candidate)),
-                    true => None,
-                }
-            })
-            .collect_into(&mut filtered);
-
-        // iterate in reverse order so that we can safely swap elements
-        // drain here so that we can possibly reuse the `filtered` Vec allcoation
-        filtered.drain(..).rev().for_each(|(i, f)| {
-            match f {
-                // for candidates, swap the candidate with the last remaining
-                // token, then dec `start_candidates`
-                FilterResult::Candidate => {
-                    // SAFETY: we know that `i` and `self.start_candidates - 1`
-                    // are both valid indices: `self.start_candidates` starts at
-                    // the end and each time it is decremented, one more element
-                    // is removed from the front, so that as long as an element
-                    // is remaining, `self.start_candidates` is always greater
-                    // than 0.
-                    // the order of the remaining elements is not meaningfully
-                    // impacted because we only ever swap with elements after
-                    // `i`, and `i` is the greatest index we will touch.
-                    unsafe {
-                        self.lexemes.swap_unchecked(i, self.start_candidates - 1);
-                        self.start_candidates = self.start_candidates.saturating_sub(1);
-                    }
-                }
-                // for removes, swap the last candidate with the last remainign
-                // token, then swap the remove with the last candidate, then dec
-                // `end_candidates` and `start_candidates`
-                FilterResult::Remove => {
-                    unsafe {
-                        // in the case that `start_candidates` ==
-                        // `end_candidates`, no swap happens and that's fine.
-                        // remove this:    v
-                        //              [a,b,c][d,e,f][g,h,i]
-                        // swap these:       ^      ^
-                        //              [a,b,f][d,e,c][g,h,i]
-                        // swap these:     ^        ^
-                        //              [a,c,f][d,e,b][g,h,i]
-                        // decrement both counters:
-                        //              [a,c][f,d,e][b,g,h,i]
-                        self.lexemes
-                            .swap_unchecked(self.start_candidates - 1, self.end_candidates - 1);
-                        self.lexemes.swap_unchecked(i, self.end_candidates - 1);
-                        self.start_candidates = self.start_candidates.saturating_sub(1);
-                        self.end_candidates = self.end_candidates.saturating_sub(1);
-                    }
-                }
-            }
-        });
-
-        // replace `filtered`
-        self.filtered = filtered;
-    }
-}
-
-/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
-/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
-/// integral types, constants or identifiers are not parsed by this.
-pub struct LexemeParser {
-    lexemes: LexemeList,
-    len: usize,
-}
-
-impl LexemeParser {
-    pub fn new() -> Self {
-        Self {
-            lexemes: LexemeList::new(),
-            len: 0,
-        }
-    }
-
-    pub fn parse(&mut self, mut tokens: impl Iterator<Item = char>) -> Option<Token> {
-        self.lexemes.clear();
-        loop {
-            let Some(ch) = tokens.next() else {
-                break;
-            };
-
-            if crate::is_things::is_whitespace(ch) {
-                break;
-            }
-
-            self.lexemes.step(ch, self.len);
-            if self.lexemes.remaining().is_empty() {
-                break;
-            }
-        }
-        self.lexemes.candidates().last().copied()
-    }
-}
-
-use itertools::Itertools;
 use trie::Tree;
 
 #[derive(Debug, Clone, Copy)]
@@ -607,14 +439,15 @@ impl<'a> Iterator for TokenIterator<'a> {
                Some((token, &self.source[start..self.offset]))
            }
+            // `//`-style comments or doc-comments
            _ => match self.parse().map(|tok| match tok {
                Token::SlashSlash => {
                    self.skip_while(|c| c == '\n');
-                    (Token::Comment)
+                    Token::Comment
                }
                Token::SlashSlashSlash => {
                    self.skip_while(|c| c == '\n');
-                    (Token::DocComment)
+                    Token::DocComment
                }
                _ => tok,
            }) {