Compare commits

..

4 commits

Author  SHA1        Message                                        Date
janis   2790bc561f  more refactoring                               2025-09-27 18:20:52 +02:00
janis   6e0fed0962  raw identifiers, comment out -- and ++ tokens  2025-09-27 17:07:50 +02:00
janis   122f8ff7f1  compiler wranings                              2025-09-26 14:58:39 +02:00
janis   2e6b8b0cc3  add integer types                              2025-09-26 14:52:53 +02:00
2 changed files with 156 additions and 238 deletions

View file

@@ -18,8 +18,6 @@ pub enum Error {
     FloatingConstantInvalidTrailingType,
     #[error("Invalid token.")]
     InvalidToken,
-    #[error("Identifier starts with invalid character.")]
-    ExpectedIdStartForIdentifier,
     #[error("Unknown suffix in constant.")]
     NumericalConstantUnknownSuffix,
 }
@@ -73,6 +71,8 @@ impl Radix {
             Radix::Dec => Token::IntegerConstant,
         }
     }
+    #[expect(dead_code)]
     pub fn from_token(token: Token) -> Option<Self> {
         match token {
             Token::IntegerHexConstant => Some(Radix::Hex),
@@ -82,6 +82,8 @@ impl Radix {
             _ => None,
         }
     }
+    #[expect(dead_code)]
     pub fn map_digit(self, c: char) -> u8 {
         match self {
            Radix::Hex => match c {
@@ -104,6 +106,8 @@ impl Radix {
            },
         }
     }
+    #[expect(dead_code)]
     pub fn folding_method(self) -> fn(u64, char) -> u64 {
         match self {
             Radix::Hex => {
@@ -152,10 +156,10 @@ impl Radix {
         }
     }
     pub fn is_digit(self) -> fn(char) -> bool {
         match self {
-            Radix::Hex => crate::is_things::is_hex_digit,
-            Radix::Bin => crate::is_things::is_bin_digit,
-            Radix::Oct => crate::is_things::is_oct_digit,
-            Radix::Dec => crate::is_things::is_digit,
+            Radix::Hex => is_things::is_hex_digit,
+            Radix::Bin => is_things::is_bin_digit,
+            Radix::Oct => is_things::is_oct_digit,
+            Radix::Dec => is_things::is_digit,
         }
     }
 }
@@ -193,11 +197,7 @@ fn try_parse_integral_type(source: &mut Source) -> Result<Option<()>> {
         return Ok(None);
     }
-    if source
-        .take_while_ref(|&c| crate::is_things::is_digit(c))
-        .count()
-        <= 0
-    {
+    if source.take_while_ref(|&c| is_things::is_digit(c)).count() <= 0 {
         return Err(Error::IntegralTypeExpectedDigit);
     };
@@ -214,7 +214,7 @@ fn try_parse_exp_part(source: &mut Source) -> Result<Option<()>> {
     if source.next_if(|&c| c.to_ascii_lowercase() == 'e').is_some() {
         let _sign = source.next_if(|&c| c == '-' || c == '+');
         if source
-            .take_while_ref(|&c| crate::is_things::is_digit(c))
+            .take_while_ref(|&c| is_things::is_digit(c))
             .count()
             .lt(&1)
         {
@@ -300,7 +300,7 @@ pub(crate) fn parse_constant(source: &mut Source) -> Result<Token> {
     // char following a constant must not be id_continue
     if source
         .peek()
-        .map(|&c| crate::is_things::is_id_continue(c))
+        .map(|&c| is_things::is_id_continue(c))
         .unwrap_or(false)
     {
         return Err(Error::NumericalConstantUnknownSuffix);

View file

@@ -58,12 +58,10 @@ mod is_things {
         }
     }
-    #[expect(dead_code)]
     pub fn is_digit(ch: char) -> bool {
         ('0'..='9').contains(&ch)
     }
-    #[expect(dead_code)]
     pub fn is_bin_digit(ch: char) -> bool {
         ch == '0' || ch == '1'
     }
@@ -73,12 +71,10 @@ mod is_things {
         ('1'..='9').contains(&ch)
     }
-    #[expect(dead_code)]
     pub fn is_oct_digit(ch: char) -> bool {
         ('0'..='7').contains(&ch)
     }
-    #[expect(dead_code)]
     pub fn is_hex_digit(ch: char) -> bool {
         ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch)
     }
@@ -188,6 +184,16 @@ tokens!(pub Token: {
     F64 => "f64",
     ISize => "isize",
     USize => "usize",
+    U1 => "u1",
+    U8 => "u8",
+    U16 => "u16",
+    U32 => "u32",
+    U64 => "u64",
+    I1 => "i1",
+    I8 => "i8",
+    I16 => "i16",
+    I32 => "i32",
+    I64 => "i64",
     Const => "const",
     Volatile => "volatile",
     Noalias => "noalias",
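
Note: with these ten lexemes registered, sized integer type names lex as dedicated keyword tokens instead of identifiers. A small illustrative check (hypothetical snippet, not part of the diff; it assumes the post-change API in which `TokenIterator` yields bare `Token`s):

    // Sketch: the new sized-integer lexemes become keyword tokens.
    let mut lexer = TokenIterator::new("u8 usize foo");
    assert_eq!(lexer.next(), Some(Token::U8));    // added by "add integer types"
    assert_eq!(lexer.next(), Some(Token::USize)); // existed before this range
    assert_eq!(lexer.next(), Some(Token::Ident)); // unregistered names stay identifiers

The updated `complex_iterator` test further down depends on the same change: `i32` now lexes as `Token::I32` rather than `Token::Ident`.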
@@ -211,9 +217,9 @@ tokens!(pub Token: {
     Bang => "!",
     Tilde => "~",
     Plus => "+",
-    PlusPlus => "++",
+    // PlusPlus => "++",
     Minus => "-",
-    MinusMinus => "--",
+    // MinusMinus => "--",
     Star => "*",
     Slash => "/",
     Percent => "%",
@@ -291,185 +297,29 @@ impl Token {
     }
 }
-/// A list of lexemes used by the `LexemeParser`.
-/// `lexemes` contains every token that has a defined lexeme, such as `fn`, `f32`, `const`, etc.
-/// The `LexemeList` keeps track of two offsets into the `lexemes` array,
-/// splitting it into three windows:
-/// - [0, start_candidates) - tokens that are still being considered for parsing
-/// - [start_candidates, end_candidates) - the tokens which this lexeme matches
-/// - [end_candidates, len) - tokens that have been filtered out and are no longer considered
-/// On each iteration of the parsing loop, the remaining tokens are matched
-/// against the next character and, if they match completely, are swapped into
-/// the candidates window, or swapped to the end if they don't.
-struct LexemeList {
-    lexemes: Box<[Token]>,
-    start_candidates: usize,
-    end_candidates: usize,
-    filtered: Vec<(usize, FilterResult)>,
-}
-enum FilterResult {
-    Remove,
-    Candidate,
-}
-impl LexemeList {
-    fn new() -> Self {
-        let lexemes = Token::lexemes()
-            .iter()
-            .map(|(tok, _)| tok.clone())
-            .collect::<Box<_>>();
-        Self {
-            start_candidates: lexemes.len(),
-            end_candidates: lexemes.len(),
-            lexemes,
-            filtered: Vec::new(),
-        }
-    }
-    fn clear(&mut self) {
-        self.start_candidates = self.lexemes.len();
-        self.end_candidates = self.lexemes.len();
-    }
-    fn remaining(&self) -> &[Token] {
-        &self.lexemes[0..self.start_candidates]
-    }
-    fn candidates(&self) -> &[Token] {
-        &self.lexemes[self.start_candidates..self.end_candidates]
-    }
-    fn step(&mut self, ch: char, pos: usize) {
-        // smartly reuse allocation for `filtered`
-        // truly one of the premature optimizations.
-        // but it just feels good, innit?
-        let mut filtered = core::mem::take(&mut self.filtered);
-        self.remaining()
-            .iter()
-            .enumerate()
-            .filter_map(|(i, tok)| {
-                let bytes = tok.lexeme().unwrap().as_bytes();
-                // SAFETY: all tokens in `self.remaining()` are lexical tokens, and
-                // they are all valid ascii
-                let c = unsafe {
-                    // TODO: maybe keep a list of `Char<'_>`s around in order to
-                    // support fully utf8 tokens?
-                    char::from_u32_unchecked(bytes[pos] as u32)
-                };
-                match c == ch {
-                    false => Some((i, FilterResult::Remove)),
-                    true if bytes.len() <= pos + 1 => Some((i, FilterResult::Candidate)),
-                    true => None,
-                }
-            })
-            .collect_into(&mut filtered);
-        // iterate in reverse order so that we can safely swap elements
-        // drain here so that we can possibly reuse the `filtered` Vec allocation
-        filtered.drain(..).rev().for_each(|(i, f)| {
-            match f {
-                // for candidates, swap the candidate with the last remaining
-                // token, then dec `start_candidates`
-                FilterResult::Candidate => {
-                    // SAFETY: we know that `i` and `self.start_candidates - 1`
-                    // are both valid indices: `self.start_candidates` starts at
-                    // the end and each time it is decremented, one more element
-                    // is removed from the front, so that as long as an element
-                    // is remaining, `self.start_candidates` is always greater
-                    // than 0.
-                    // the order of the remaining elements is not meaningfully
-                    // impacted because we only ever swap with elements after
-                    // `i`, and `i` is the greatest index we will touch.
-                    unsafe {
-                        self.lexemes.swap_unchecked(i, self.start_candidates - 1);
-                        self.start_candidates = self.start_candidates.saturating_sub(1);
-                    }
-                }
-                // for removes, swap the last candidate with the last remaining
-                // token, then swap the remove with the last candidate, then dec
-                // `end_candidates` and `start_candidates`
-                FilterResult::Remove => {
-                    unsafe {
-                        // in the case that `start_candidates` ==
-                        // `end_candidates`, no swap happens and that's fine.
-                        // remove this: v
-                        // [a,b,c][d,e,f][g,h,i]
-                        // swap these:  ^     ^
-                        // [a,b,f][d,e,c][g,h,i]
-                        // swap these:  ^     ^
-                        // [a,c,f][d,e,b][g,h,i]
-                        // decrement both counters:
-                        // [a,c][f,d,e][b,g,h,i]
-                        self.lexemes
-                            .swap_unchecked(self.start_candidates - 1, self.end_candidates - 1);
-                        self.lexemes.swap_unchecked(i, self.end_candidates - 1);
-                        self.start_candidates = self.start_candidates.saturating_sub(1);
-                        self.end_candidates = self.end_candidates.saturating_sub(1);
-                    }
-                }
-            }
-        });
-        // replace `filtered`
-        self.filtered = filtered;
-    }
-}
-/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
-/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
-/// integral types, constants or identifiers are not parsed by this.
-pub struct LexemeParser {
-    lexemes: LexemeList,
-    len: usize,
-}
-impl LexemeParser {
-    pub fn new() -> Self {
-        Self {
-            lexemes: LexemeList::new(),
-            len: 0,
-        }
-    }
-    pub fn parse(&mut self, mut tokens: impl Iterator<Item = char>) -> Option<Token> {
-        self.lexemes.clear();
-        loop {
-            let Some(ch) = tokens.next() else {
-                break;
-            };
-            if crate::is_things::is_whitespace(ch) {
-                break;
-            }
-            self.lexemes.step(ch, self.len);
-            if self.lexemes.remaining().is_empty() {
-                break;
-            }
-        }
-        self.lexemes.candidates().last().copied()
-    }
-}
-use itertools::Itertools;
+use std::ops::Range;
 use trie::Tree;
+pub struct TokenItem<'a> {
+    pub token: Token,
+    pub lexeme: &'a str,
+    pub offset: u32,
+}
 #[derive(Debug, Clone, Copy)]
-struct CountingIterator<I: Iterator> {
+struct CharCountingIterator<I: Iterator> {
     iter: I,
     count: usize,
 }
-impl<I: Iterator> From<I> for CountingIterator<I> {
+impl<I: Iterator> From<I> for CharCountingIterator<I> {
     fn from(iter: I) -> Self {
         Self { iter, count: 0 }
     }
 }
-impl<I: Iterator<Item = char>> Iterator for CountingIterator<I> {
+impl<I: Iterator<Item = char>> Iterator for CharCountingIterator<I> {
     type Item = I::Item;
     fn next(&mut self) -> Option<Self::Item> {
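
Note: the doc comment on the removed `LexemeList` describes a swap-based, three-window partition of the lexeme array; lexeme matching now goes through the `trie::Tree` lookup in `TokenIterator::parse`. For readers of the deleted code, here is a minimal standalone sketch of that windowing scheme (the `Windows` name and `&str` element type are illustrative, not the repository's types):

    // Sketch of the three-window scheme from the removed doc comment:
    // [0, start_candidates) holds lexemes still in play,
    // [start_candidates, end_candidates) holds full matches, and
    // [end_candidates, len) holds rejected lexemes.
    struct Windows<'a> {
        lexemes: Vec<&'a str>,
        start_candidates: usize,
        end_candidates: usize,
    }

    impl<'a> Windows<'a> {
        fn new(lexemes: Vec<&'a str>) -> Self {
            let len = lexemes.len();
            Self { lexemes, start_candidates: len, end_candidates: len }
        }

        // Match the `pos`-th byte of each remaining lexeme against `ch`
        // (ASCII assumed, as in the removed code): complete matches are
        // swapped into the candidate window, mismatches behind it.
        fn step(&mut self, ch: char, pos: usize) {
            // Evaluate first, then apply swaps in reverse index order,
            // mirroring the collect-then-drain structure of the removed `step`.
            let actions: Vec<(usize, bool)> = self.lexemes[..self.start_candidates]
                .iter()
                .enumerate()
                .filter_map(|(i, lex)| {
                    let bytes = lex.as_bytes();
                    match bytes.get(pos) == Some(&(ch as u8)) {
                        false => Some((i, false)),                         // filtered out
                        true if bytes.len() == pos + 1 => Some((i, true)), // full match
                        true => None,                                      // still in play
                    }
                })
                .collect();
            for (i, full_match) in actions.into_iter().rev() {
                if full_match {
                    self.lexemes.swap(i, self.start_candidates - 1);
                    self.start_candidates -= 1;
                } else {
                    self.lexemes
                        .swap(self.start_candidates - 1, self.end_candidates - 1);
                    self.lexemes.swap(i, self.end_candidates - 1);
                    self.start_candidates -= 1;
                    self.end_candidates -= 1;
                }
            }
        }

        fn candidates(&self) -> &[&'a str] {
            &self.lexemes[self.start_candidates..self.end_candidates]
        }
    }

    fn main() {
        let mut windows = Windows::new(vec!["+", "++", "->", "-"]);
        windows.step('+', 0); // "+" fully matches, "++" stays in play, "->" and "-" drop out
        assert_eq!(windows.candidates(), &["+"]);
        windows.step('+', 1); // "++" fully matches and joins "+" in the candidate window
        assert_eq!(windows.candidates(), &["++", "+"]);
    }

The collect-then-apply split matters: all matches are decided against a stable array before any swap runs, and applying swaps in descending index order only ever touches indices at or above `i`, so pending entries are never disturbed.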
@@ -477,13 +327,13 @@ impl<I: Iterator<Item = char>> Iterator for CharCountingIterator<I> {
     }
 }
-impl<I: Iterator> CountingIterator<I> {
+impl<I: Iterator> CharCountingIterator<I> {
     pub(crate) fn offset(&self) -> usize {
         self.count
     }
 }
-impl<I: Iterator> core::ops::Deref for CountingIterator<I> {
+impl<I: Iterator> core::ops::Deref for CharCountingIterator<I> {
     type Target = I;
     fn deref(&self) -> &Self::Target {
@@ -491,13 +341,13 @@ impl<I: Iterator> core::ops::Deref for CountingIterator<I> {
     }
 }
-impl<I: Iterator> core::ops::DerefMut for CountingIterator<I> {
+impl<I: Iterator> core::ops::DerefMut for CharCountingIterator<I> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         &mut self.iter
     }
 }
-type Source<'a> = CountingIterator<core::iter::Peekable<core::str::Chars<'a>>>;
+type Source<'a> = CharCountingIterator<core::iter::Peekable<core::str::Chars<'a>>>;
 pub struct TokenIterator<'a> {
     trie: Tree<char, Token>,
@@ -521,11 +371,11 @@ impl<'a> TokenIterator<'a> {
     }
     fn peekable_source(&self) -> Source<'a> {
-        CountingIterator::from(self.source[self.offset..].chars().peekable())
+        CharCountingIterator::from(self.source[self.offset..].chars().peekable())
     }
     fn parse(&mut self) -> Option<Token> {
-        let mut iter = CountingIterator::from(self.source[self.offset..].chars());
+        let mut iter = CharCountingIterator::from(self.source[self.offset..].chars());
         match self.trie.get_closest(&mut iter) {
             Some(token) => {
@@ -565,12 +415,8 @@ impl<'a> TokenIterator<'a> {
         }
         count
     }
-}
-impl<'a> Iterator for TokenIterator<'a> {
-    type Item = (Token, &'a str);
-    fn next(&mut self) -> Option<Self::Item> {
+    fn next_token(&mut self) -> Option<(Token, Range<usize>)> {
         // skip whitespace
         self.skip_whitespaces();
@@ -583,36 +429,49 @@ impl<'a> Iterator for TokenIterator<'a> {
                 let token = complex_tokens::parse_constant(&mut source).ok()?;
                 self.offset += source.offset();
-                Some((token, &self.source[start..self.offset]))
+                Some(token)
             }
             Some('.') if cursor.next().map_or(false, is_things::is_digit) => {
                 let token = complex_tokens::parse_constant(&mut source).ok()?;
                 self.offset += source.offset();
-                Some((token, &self.source[start..self.offset]))
+                Some(token)
             }
             Some('\'' | '"') => {
                 let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?;
                 self.offset += source.offset();
-                Some((token, &self.source[start..self.offset]))
+                Some(token)
             }
+            Some('`') => {
+                // raw identifier
+                self.skip(1);
+                self.skip_while(|c| is_things::is_id_continue(c));
+                if self.peekable_source().next() == Some('`') {
+                    self.skip(1);
+                    Some(Token::Ident)
+                } else {
+                    // unterminated raw identifier
+                    Some(Token::ParseError)
+                }
+            }
+            // `//`-style comments or doc-comments
             _ => match self.parse().map(|tok| match tok {
                 Token::SlashSlash => {
                     self.skip_while(|c| c == '\n');
-                    (Token::Comment)
+                    Token::Comment
                 }
                 Token::SlashSlashSlash => {
                     self.skip_while(|c| c == '\n');
-                    (Token::DocComment)
+                    Token::DocComment
                 }
                 _ => tok,
             }) {
                 Some(tok) => {
                     if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 {
-                        Some((Token::Ident, &self.source[start..self.offset]))
+                        Some(Token::Ident)
                     } else {
-                        Some((tok, &self.source[start..self.offset]))
+                        Some(tok)
                     }
                 }
                 None => {
@@ -623,15 +482,49 @@ impl<'a> Iterator for TokenIterator<'a> {
                 {
                     self.skip(1);
                     self.skip_while(|c| is_things::is_id_continue(c));
-                    Some((Token::Ident, &self.source[start..self.offset]))
+                    Some(Token::Ident)
                 } else {
                     None
                 }
             }
         },
-        };
-        token
+        }?;
+        Some((token, start..self.offset))
+    }
+    fn next_token_item(&mut self) -> Option<TokenItem<'a>> {
+        let (token, range) = self.next_token()?;
+        let lexeme = &self.source[range.clone()];
+        Some(TokenItem {
+            token,
+            lexeme,
+            offset: range.start as u32,
+        })
+    }
+    pub fn into_token_items(self) -> TokenItemIterator<'a> {
+        TokenItemIterator { inner: self }
+    }
+}
+impl<'a> Iterator for TokenIterator<'a> {
+    type Item = Token;
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_token().map(|(token, _)| token)
+    }
+}
+pub struct TokenItemIterator<'a> {
+    inner: TokenIterator<'a>,
+}
+impl<'a> Iterator for TokenItemIterator<'a> {
+    type Item = TokenItem<'a>;
+    fn next(&mut self) -> Option<Self::Item> {
+        self.inner.next_token_item()
     }
 }
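
Note: this hunk is the heart of the refactor. `next_token` now returns the token plus its byte `Range`, the plain `Iterator` impl yields bare `Token`s, and lexemes with offsets move behind `into_token_items()`. A sketch of consumer code against the new surface (the `dump_tokens` and `raw_ident_example` helpers are hypothetical; `TokenIterator`, `TokenItem`, and `into_token_items` are as introduced above):

    // Hypothetical consumer of the split API from this diff.
    fn dump_tokens(source: &str) {
        // Bare token stream: `TokenIterator` is now Iterator<Item = Token>.
        for token in TokenIterator::new(source) {
            println!("{:?}", token);
        }
        // Token plus lexeme and byte offset via the new `TokenItemIterator`.
        for item in TokenIterator::new(source).into_token_items() {
            println!("{:>4}: {:?} {:?}", item.offset, item.token, item.lexeme);
        }
    }

    // Raw identifiers (added in this range) flow through both forms:
    // a backtick-quoted name such as `fn` lexes as a single Token::Ident.
    fn raw_ident_example() {
        let mut lexer = TokenIterator::new("`fn` x");
        assert_eq!(lexer.next(), Some(Token::Ident)); // the raw identifier
        assert_eq!(lexer.next(), Some(Token::Ident)); // "x"
        assert_eq!(lexer.next(), None);
    }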
@@ -643,44 +536,69 @@ mod tests {
     #[test]
     fn test_iterator() {
-        let tokens = "fn let void+++(++bool)";
+        let tokens = "fn let void+(+bool)";
         let mut lexer = TokenIterator::new(&tokens);
-        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
-        assert_eq!(lexer.next(), Some((Token::Let, "let")));
-        assert_eq!(lexer.next(), Some((Token::Void, "void")));
-        assert_eq!(lexer.next(), Some((Token::PlusPlus, "++")));
-        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
-        assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
-        assert_eq!(lexer.next(), Some((Token::PlusPlus, "++")));
-        assert_eq!(lexer.next(), Some((Token::Bool, "bool")));
-        assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
+        assert_eq!(lexer.next(), Some(Token::Fn));
+        assert_eq!(lexer.next(), Some(Token::Let));
+        assert_eq!(lexer.next(), Some(Token::Void));
+        assert_eq!(lexer.next(), Some(Token::Plus));
+        assert_eq!(lexer.next(), Some(Token::OpenParens));
+        assert_eq!(lexer.next(), Some(Token::Plus));
+        assert_eq!(lexer.next(), Some(Token::Bool));
+        assert_eq!(lexer.next(), Some(Token::CloseParens));
         assert_eq!(lexer.next(), None);
     }
+    #[test]
+    fn idents() {
+        let mut lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123");
+        assert!(lexer.all(|tok| tok == Token::Ident));
+    }
+    #[test]
+    fn ident_minus_ambiguity() {
+        let lexer = TokenIterator::new("a-a a- - a -a --a");
+        let tokens = lexer.collect::<Vec<_>>();
+        assert_eq!(
+            tokens,
+            vec![
+                Token::Ident,
+                Token::Ident,
+                Token::Minus,
+                Token::Ident,
+                Token::Ident,
+                Token::Ident
+            ]
+        );
+    }
     #[test]
     fn complex_iterator() {
         let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }";
-        let mut lexer = TokenIterator::new(&tokens);
-        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "my-function")));
-        assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "x")));
-        assert_eq!(lexer.next(), Some((Token::Colon, ":")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "i32")));
-        assert_eq!(lexer.next(), Some((Token::Comma, ",")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "y")));
-        assert_eq!(lexer.next(), Some((Token::Colon, ":")));
-        assert_eq!(lexer.next(), Some((Token::F32, "f32")));
-        assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
-        assert_eq!(lexer.next(), Some((Token::MinusGreater, "->")));
-        assert_eq!(lexer.next(), Some((Token::F32, "f32")));
-        assert_eq!(lexer.next(), Some((Token::OpenBrace, "{")));
-        assert_eq!(lexer.next(), Some((Token::Return, "return")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "x")));
-        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
-        assert_eq!(lexer.next(), Some((Token::Ident, "y")));
-        assert_eq!(lexer.next(), Some((Token::Semi, ";")));
-        assert_eq!(lexer.next(), Some((Token::CloseBrace, "}")));
-        assert_eq!(lexer.next(), None);
+        let lexer = TokenIterator::new(&tokens);
+        let mut items = lexer
+            .into_token_items()
+            .map(|item| (item.token, item.lexeme));
+        assert_eq!(items.next(), Some((Token::Fn, "fn")));
+        assert_eq!(items.next(), Some((Token::Ident, "my-function")));
+        assert_eq!(items.next(), Some((Token::OpenParens, "(")));
+        assert_eq!(items.next(), Some((Token::Ident, "x")));
+        assert_eq!(items.next(), Some((Token::Colon, ":")));
+        assert_eq!(items.next(), Some((Token::I32, "i32")));
+        assert_eq!(items.next(), Some((Token::Comma, ",")));
+        assert_eq!(items.next(), Some((Token::Ident, "y")));
+        assert_eq!(items.next(), Some((Token::Colon, ":")));
+        assert_eq!(items.next(), Some((Token::F32, "f32")));
+        assert_eq!(items.next(), Some((Token::CloseParens, ")")));
+        assert_eq!(items.next(), Some((Token::MinusGreater, "->")));
+        assert_eq!(items.next(), Some((Token::F32, "f32")));
+        assert_eq!(items.next(), Some((Token::OpenBrace, "{")));
+        assert_eq!(items.next(), Some((Token::Return, "return")));
+        assert_eq!(items.next(), Some((Token::Ident, "x")));
+        assert_eq!(items.next(), Some((Token::Plus, "+")));
+        assert_eq!(items.next(), Some((Token::Ident, "y")));
+        assert_eq!(items.next(), Some((Token::Semi, ";")));
+        assert_eq!(items.next(), Some((Token::CloseBrace, "}")));
+        assert_eq!(items.next(), None);
     }
 }