Compare commits

...

7 commits

Author SHA1 Message Date
janis c270fe5add
comments and docs as ast nodes 2025-10-01 00:31:08 +02:00
janis 45ba06db43
use pomelo for parsing 2025-10-01 00:24:46 +02:00
janis 816aebda01
try out pomelo parser 2025-09-30 17:29:38 +02:00
janis 357590ec07
more useless token stuff that I didn't really want anyway 2025-09-30 16:44:42 +02:00
janis df2bb54272
parser crate 2025-09-29 15:56:13 +02:00
janis ae0fb53b90
stuff.. 2025-09-29 15:56:05 +02:00
janis 260150de15
more iterator stuff 2025-09-27 22:16:04 +02:00
6 changed files with 992 additions and 120 deletions

View file

@ -2,7 +2,7 @@
resolver = "3" resolver = "3"
members = [ members = [
"crates/lexer" "crates/lexer"
] , "crates/parser"]
[package] [package]
name = "compiler" name = "compiler"
@ -34,4 +34,4 @@ thiserror = "1.0.63"
itertools = "0.13.0" itertools = "0.13.0"
werkzeug = { path = "../../rust/werkzeug" } werkzeug = { path = "../../rust/werkzeug" }
trie = { path = "../../rust/trie" } trie = { path = "../../rust/trie" }

View file

@ -9,4 +9,6 @@ werkzeug = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
itertools = { workspace = true } itertools = { workspace = true }
trie = { workspace = true } trie = { workspace = true }
unicode-xid = { workspace = true } unicode-xid = { workspace = true }
variadics_please = "1.1.0"

View file

@ -63,23 +63,12 @@ impl Radix {
Radix::Dec => 10, Radix::Dec => 10,
} }
} }
fn to_token(self) -> Token { fn to_constant_kind(self) -> ConstantKind {
match self { match self {
Radix::Hex => Token::IntegerHexConstant, Radix::Hex => ConstantKind::HexInteger,
Radix::Bin => Token::IntegerBinConstant, Radix::Bin => ConstantKind::BinInteger,
Radix::Oct => Token::IntegerOctConstant, Radix::Oct => ConstantKind::OctInteger,
Radix::Dec => Token::IntegerConstant, Radix::Dec => ConstantKind::Integer,
}
}
#[expect(dead_code)]
pub fn from_token(token: Token) -> Option<Self> {
match token {
Token::IntegerHexConstant => Some(Radix::Hex),
Token::IntegerBinConstant => Some(Radix::Bin),
Token::IntegerOctConstant => Some(Radix::Oct),
Token::IntegerConstant => Some(Radix::Dec),
_ => None,
} }
} }
@ -236,7 +225,8 @@ fn try_parse_exp_part(source: &mut Source) -> Result<Option<()>> {
// DEC_DIGITS FloatingType? // DEC_DIGITS FloatingType?
// `.` DEC_DIGITS EXP_PART? FloatingType? // `.` DEC_DIGITS EXP_PART? FloatingType?
// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType? // DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
fn parse_constant_inner(source: &mut Source) -> Result<Token> { fn parse_constant_inner(source: &mut Source) -> Result<ConstantKind> {
let start = source.count;
let zero = source.next_if(|&c| c == '0').is_some(); let zero = source.next_if(|&c| c == '0').is_some();
let radix = zero let radix = zero
@ -248,7 +238,7 @@ fn parse_constant_inner(source: &mut Source) -> Result<Token> {
if source.peek().map(|&c| c == 'u' || c == 'i') == Some(true) { if source.peek().map(|&c| c == 'u' || c == 'i') == Some(true) {
try_parse_integral_type(source)?; try_parse_integral_type(source)?;
} }
return Ok(radix.to_token()); return Ok(radix.to_constant_kind());
} }
// if zero: `_`* DIGIT (DIGIT|`_`)* // if zero: `_`* DIGIT (DIGIT|`_`)*
@ -260,7 +250,7 @@ fn parse_constant_inner(source: &mut Source) -> Result<Token> {
}?; }?;
if let Some(_) = source.try_map_iter_if(|source| try_parse_integral_type(source))? { if let Some(_) = source.try_map_iter_if(|source| try_parse_integral_type(source))? {
return Ok(Token::IntegerConstant); return Ok(ConstantKind::Integer);
} }
let dot = source.next_if(|&c| c == '.').is_some(); let dot = source.next_if(|&c| c == '.').is_some();
@ -285,17 +275,48 @@ fn parse_constant_inner(source: &mut Source) -> Result<Token> {
}; };
let token = match (dot, exp, floating) { let token = match (dot, exp, floating) {
(false, false, false) => Token::IntegerConstant, (false, false, false) => ConstantKind::Integer,
(true, false, _) => Token::DotFloatingConstant, (true, false, _) => ConstantKind::DotFloating,
(true, true, _) => Token::DotFloatingExpConstant, (true, true, _) => ConstantKind::DotFloatingExp,
(false, true, _) => Token::FloatingExpConstant, (false, true, _) => ConstantKind::FloatingExp,
(false, _, _) => Token::FloatingConstant, (false, _, _) => ConstantKind::Floating,
}; };
Ok(token) Ok(token)
} }
pub(crate) fn parse_constant(source: &mut Source) -> Result<Token> { #[derive(Debug, Clone, Copy, PartialEq, Eq)]
/// The kind of a lexed constant; paired with its lexeme it converts into the
/// matching [`Token`] variant (see the `From<(ConstantKind, &str)>` impl).
pub enum ConstantKind {
    /// Decimal integer literal (e.g. `42i64`).
    Integer,
    /// Binary integer literal (e.g. `0b1011_0010i16`).
    BinInteger,
    /// Octal integer literal (e.g. `0o755u8`).
    OctInteger,
    /// Hexadecimal integer literal (e.g. `0x1A3F_u32`).
    HexInteger,
    /// Float literal containing a `.` but no exponent (e.g. `3.14f64`).
    DotFloating,
    /// Float literal containing both a `.` and an exponent (e.g. `2.71828e0f32`).
    DotFloatingExp,
    /// Float literal with an exponent but no `.` (e.g. `22e23`).
    FloatingExp,
    /// Float literal with neither `.` nor exponent, only a suffix (e.g. `13f32`).
    Floating,
    /// Character literal.
    Char,
    /// String literal.
    String,
}
impl<'a> From<(ConstantKind, &'a str)> for Token<'a> {
fn from((value, lexeme): (ConstantKind, &'a str)) -> Self {
match value {
ConstantKind::Integer => Token::IntegerConstant(lexeme),
ConstantKind::BinInteger => Token::IntegerBinConstant(lexeme),
ConstantKind::OctInteger => Token::IntegerOctConstant(lexeme),
ConstantKind::HexInteger => Token::IntegerHexConstant(lexeme),
ConstantKind::DotFloating => Token::DotFloatingConstant(lexeme),
ConstantKind::DotFloatingExp => Token::DotFloatingExpConstant(lexeme),
ConstantKind::FloatingExp => Token::FloatingExpConstant(lexeme),
ConstantKind::Floating => Token::FloatingConstant(lexeme),
ConstantKind::Char => Token::CharConstant(lexeme),
ConstantKind::String => Token::StringConstant(lexeme),
}
}
}
pub(crate) fn parse_constant(source: &mut Source) -> Result<ConstantKind> {
let constant = parse_constant_inner(source)?; let constant = parse_constant_inner(source)?;
// char following a constant must not be id_continue // char following a constant must not be id_continue
if source if source
@ -309,7 +330,7 @@ pub(crate) fn parse_constant(source: &mut Source) -> Result<Token> {
Ok(constant) Ok(constant)
} }
pub(crate) fn parse_string_or_char_constant(source: &mut Source) -> Result<Token> { pub(crate) fn parse_string_or_char_constant(source: &mut Source) -> Result<ConstantKind> {
let quote = source let quote = source
.next_if(|&c| c == '"' || c == '\'') .next_if(|&c| c == '"' || c == '\'')
.ok_or(Error::InvalidToken)?; .ok_or(Error::InvalidToken)?;
@ -340,15 +361,64 @@ pub(crate) fn parse_string_or_char_constant(source: &mut Source) -> Result<Token
} }
if is_char { if is_char {
Ok(Token::CharConstant) Ok(ConstantKind::Char)
} else { } else {
Ok(Token::StringConstant) Ok(ConstantKind::String)
} }
} }
/// Parses a `//`-style line comment, greedily absorbing directly-following
/// continuation lines of the same kind.
///
/// Returns `Ok(true)` if it was a doc comment (`///`), `Ok(false)` for a
/// regular comment, and `Err(Error::InvalidToken)` if the source does not
/// start with `//`.
pub(crate) fn parse_comment(source: &mut Source) -> Result<bool> {
    // A comment must start with `//`.
    if !(source.next() == Some('/') && source.next() == Some('/')) {
        return Err(Error::InvalidToken);
    }
    // A third slash marks a doc comment.
    let doc = source.next_if_eq(&'/').is_some();
    loop {
        // Consume the rest of the current comment line, including the newline.
        source.take_while_inclusive(|&c| c != '\n').for_each(drop);
        // Look ahead on a clone; `source` is only advanced if the comment
        // actually continues on the next line.
        let mut copy = source.clone();
        // Skip horizontal whitespace after the newline to find a continuation.
        (&mut copy)
            .take_while_ref(|&c| is_things::is_whitespace(c) && c != '\n')
            .for_each(drop);
        if (copy.next() == Some('/')) && (copy.next() == Some('/')) {
            match copy.next() {
                None => break,
                // docs end here, regular comment starts
                Some('\n') if doc => break,
                // this is a comment, so we can just take until this new line
                Some('\n') if !doc => continue,
                // continue doc comment
                Some('/') if doc => {}
                Some('/') if !doc => break,
                Some(_) if doc => break,
                // continue regular comment
                Some(_) => {}
            }
            *source = copy;
        } else {
            break;
        }
    }
    Ok(doc)
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::complex_tokens::parse_comment;
use super::*; use super::*;
fn make_source(s: &'_ str) -> Source<'_> { fn make_source(s: &'_ str) -> Source<'_> {
@ -359,36 +429,36 @@ mod tests {
fn parse_constant_number() { fn parse_constant_number() {
assert_eq!( assert_eq!(
parse_constant(&mut make_source("0x1A3F_u32")), parse_constant(&mut make_source("0x1A3F_u32")),
Ok(Token::IntegerHexConstant) Ok(ConstantKind::HexInteger)
); );
assert_eq!( assert_eq!(
parse_constant(&mut make_source("13f32")), parse_constant(&mut make_source("13f32")),
Ok(Token::FloatingConstant) Ok(ConstantKind::Floating)
); );
assert_eq!( assert_eq!(
parse_constant(&mut make_source("0b1011_0010i16")), parse_constant(&mut make_source("0b1011_0010i16")),
Ok(Token::IntegerBinConstant) Ok(ConstantKind::BinInteger)
); );
assert_eq!( assert_eq!(
parse_constant(&mut make_source("0o755u8")), parse_constant(&mut make_source("0o755u8")),
Ok(Token::IntegerOctConstant) Ok(ConstantKind::OctInteger)
); );
assert_eq!( assert_eq!(
parse_constant(&mut make_source("42i64")), parse_constant(&mut make_source("42i64")),
Ok(Token::IntegerConstant) Ok(ConstantKind::Integer)
); );
assert_eq!( assert_eq!(
parse_constant(&mut make_source("3.14f64")), parse_constant(&mut make_source("3.14f64")),
Ok(Token::DotFloatingConstant) Ok(ConstantKind::DotFloating)
); );
assert_eq!( assert_eq!(
parse_constant(&mut make_source("2.71828e0f32")), parse_constant(&mut make_source("2.71828e0f32")),
Ok(Token::DotFloatingExpConstant) Ok(ConstantKind::DotFloatingExp)
); );
assert_eq!( assert_eq!(
parse_constant(&mut make_source("22e23")), parse_constant(&mut make_source("22e23")),
Ok(Token::FloatingExpConstant) Ok(ConstantKind::FloatingExp)
); );
} }
} }

View file

@ -1,4 +1,4 @@
#![feature(slice_swap_unchecked, iter_collect_into)] #![feature(slice_swap_unchecked, iter_collect_into, push_mut)]
mod is_things { mod is_things {
/// True if `c` is considered a whitespace according to Rust language definition. /// True if `c` is considered a whitespace according to Rust language definition.
@ -91,47 +91,47 @@ macro_rules! tokens {
#[allow(dead_code)] #[allow(dead_code)]
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)] #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)]
$vis enum $ty_name { $vis enum $ty_name<'a> {
$($name, $($name,
)* )*
$($name2,)* $($name2(&'a str),)*
} }
impl ::core::fmt::Display for $ty_name { impl ::core::fmt::Display for $ty_name<'_> {
fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
match self { match self {
$(Self::$name => write!(f, "{}", $lexeme),)* $(Self::$name => write!(f, "{}", $lexeme),)*
$(Self::$name2 => write!(f, "<{}>", stringify!($name2))),* $(Self::$name2(lexeme) => write!(f, "[{}: {lexeme}]", stringify!($name2))),*
} }
} }
} }
#[allow(dead_code)] #[allow(dead_code)]
impl $ty_name { impl $ty_name<'_> {
$vis fn lexeme(&self) -> Option<&'static str> { $vis fn lexeme(&'_ self) -> &'_ str {
match self { match self {
$(Self::$name => Some($lexeme),)* $(Self::$name => $lexeme,)*
$(Self::$name2 => None),* $(Self::$name2(lexeme) => lexeme),*
} }
} }
/// returns the number of chars in this lexeme /// returns the number of chars in this lexeme
$vis fn lexeme_len(&self) -> usize { $vis fn lexeme_len(&self) -> usize {
self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0) self.lexeme().chars().count()
} }
/// returns the number of chars in this lexeme /// returns the number of chars in this lexeme
$vis fn lexeme_len_utf8(&self) -> usize { $vis fn lexeme_len_utf8(&self) -> usize {
self.lexeme().map(|lexeme|lexeme.len()).unwrap_or(0) self.lexeme().len()
} }
$vis fn maybe_ident(&self) -> bool { $vis fn maybe_ident(&self) -> bool {
self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false) crate::is_things::is_ident(self.lexeme())
} }
$vis fn lexemes() -> &'static [(Self, &'static str)] { $vis fn lexemes() -> &'static [(Token<'static>, &'static str)] {
&[ &[
$((Self::$name, $lexeme)),* $((Token::$name, $lexeme)),*
] ]
} }
} }
@ -159,8 +159,8 @@ tokens!(pub Token: {
}, },
// Lexical Tokens: // Lexical Tokens:
{ {
SlashSlash => "//", // SlashSlash => "//",
SlashSlashSlash => "///", // SlashSlashSlash => "///",
// SlashStar => "/*", // SlashStar => "/*",
// SlashStarStar => "/**", // SlashStarStar => "/**",
//StarSlash => "*/", //StarSlash => "*/",
@ -195,6 +195,7 @@ tokens!(pub Token: {
I32 => "i32", I32 => "i32",
I64 => "i64", I64 => "i64",
Const => "const", Const => "const",
Mutable => "mut",
Volatile => "volatile", Volatile => "volatile",
Noalias => "noalias", Noalias => "noalias",
Fn => "fn", Fn => "fn",
@ -211,6 +212,7 @@ tokens!(pub Token: {
Packed => "packed", Packed => "packed",
Extern => "extern", Extern => "extern",
Pub => "pub", Pub => "pub",
Module => "mod",
// Operators // Operators
Dot => ".", Dot => ".",
MinusGreater => "->", MinusGreater => "->",
@ -249,7 +251,7 @@ tokens!(pub Token: {
GreaterGreaterEqual => ">>=" GreaterGreaterEqual => ">>="
}); });
impl Token { impl Token<'_> {
pub fn is_assignment_op(self) -> bool { pub fn is_assignment_op(self) -> bool {
match self { match self {
Token::PlusEqual Token::PlusEqual
@ -297,13 +299,13 @@ impl Token {
} }
} }
use std::ops::Range; use std::{marker::PhantomData, ops::Range};
use trie::Tree; use trie::Tree;
#[derive(Debug, Clone, Copy)]
pub struct TokenItem<'a> { pub struct TokenItem<'a> {
pub token: Token, pub token: Token<'a>,
pub lexeme: &'a str,
pub offset: u32, pub offset: u32,
} }
@ -333,24 +335,39 @@ impl<I: Iterator> CharCountingIterator<I> {
} }
} }
impl<I: Iterator> core::ops::Deref for CharCountingIterator<I> { impl<I: Iterator<Item = char>> CharCountingIterator<core::iter::Peekable<I>> {
type Target = I; fn peek(&mut self) -> Option<&I::Item> {
self.iter.peek()
}
fn deref(&self) -> &Self::Target { fn next_if_eq(&mut self, expected: &I::Item) -> Option<I::Item>
&self.iter where
I::Item: PartialEq,
{
self.iter
.next_if_eq(expected)
.inspect(|c| self.count += c.len_utf8())
} }
} }
impl<I: Iterator> core::ops::DerefMut for CharCountingIterator<I> { // impl<I: Iterator> core::ops::Deref for CharCountingIterator<I> {
fn deref_mut(&mut self) -> &mut Self::Target { // type Target = I;
&mut self.iter
} // fn deref(&self) -> &Self::Target {
} // &self.iter
// }
// }
// impl<I: Iterator> core::ops::DerefMut for CharCountingIterator<I> {
// fn deref_mut(&mut self) -> &mut Self::Target {
// &mut self.iter
// }
// }
type Source<'a> = CharCountingIterator<core::iter::Peekable<core::str::Chars<'a>>>; type Source<'a> = CharCountingIterator<core::iter::Peekable<core::str::Chars<'a>>>;
pub struct TokenIterator<'a> { pub struct TokenIterator<'a> {
trie: Tree<char, Token>, trie: Tree<char, Token<'static>>,
source: &'a str, source: &'a str,
offset: usize, offset: usize,
} }
@ -374,7 +391,7 @@ impl<'a> TokenIterator<'a> {
CharCountingIterator::from(self.source[self.offset..].chars().peekable()) CharCountingIterator::from(self.source[self.offset..].chars().peekable())
} }
fn parse(&mut self) -> Option<Token> { fn parse(&mut self) -> Option<Token<'static>> {
let mut iter = CharCountingIterator::from(self.source[self.offset..].chars()); let mut iter = CharCountingIterator::from(self.source[self.offset..].chars());
match self.trie.get_closest(&mut iter) { match self.trie.get_closest(&mut iter) {
@ -416,7 +433,11 @@ impl<'a> TokenIterator<'a> {
count count
} }
fn next_token(&mut self) -> Option<(Token, Range<usize>)> { fn follows(&self, s: &str) -> bool {
self.source[self.offset..].starts_with(s)
}
fn next_token(&mut self) -> Option<(Token<'a>, Range<usize>)> {
// skip whitespace // skip whitespace
self.skip_whitespaces(); self.skip_whitespaces();
@ -429,19 +450,19 @@ impl<'a> TokenIterator<'a> {
let token = complex_tokens::parse_constant(&mut source).ok()?; let token = complex_tokens::parse_constant(&mut source).ok()?;
self.offset += source.offset(); self.offset += source.offset();
Some(token) Some((token, &self.source[start..self.offset]).into())
} }
Some('.') if cursor.next().map_or(false, is_things::is_digit) => { Some('.') if cursor.next().map_or(false, is_things::is_digit) => {
let token = complex_tokens::parse_constant(&mut source).ok()?; let token = complex_tokens::parse_constant(&mut source).ok()?;
self.offset += source.offset(); self.offset += source.offset();
Some(token) Some((token, &self.source[start..self.offset]).into())
} }
Some('\'' | '"') => { Some('\'' | '"') => {
let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?; let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?;
self.offset += source.offset(); self.offset += source.offset();
Some(token) Some((token, &self.source[start..self.offset]).into())
} }
Some('`') => { Some('`') => {
// raw identifier // raw identifier
@ -449,27 +470,32 @@ impl<'a> TokenIterator<'a> {
self.skip_while(|c| is_things::is_id_continue(c)); self.skip_while(|c| is_things::is_id_continue(c));
if self.peekable_source().next() == Some('`') { if self.peekable_source().next() == Some('`') {
self.skip(1); self.skip(1);
Some(Token::Ident) let lexeme = &self.source[start..self.offset];
Some(Token::Ident(lexeme))
} else { } else {
// unterminated raw identifier // unterminated raw identifier
Some(Token::ParseError) let lexeme = &self.source[start..self.offset];
Some(Token::ParseError(lexeme))
} }
} }
// `//`-style comments or doc-comments // `//`-style comments or doc-comments
_ => match self.parse().map(|tok| match tok { Some('/') if self.follows("//") => {
Token::SlashSlash => { let doc = complex_tokens::parse_comment(&mut source).ok()?;
self.skip_while(|c| c == '\n'); self.offset += source.offset();
Token::Comment eprintln!("next: {:?}", source.next());
eprintln!("rest: {:?}", &self.source[self.offset..]);
let lexeme = &self.source[start..self.offset];
if doc {
Some(Token::DocComment(lexeme))
} else {
Some(Token::Comment(lexeme))
} }
Token::SlashSlashSlash => { }
self.skip_while(|c| c == '\n'); _ => match self.parse() {
Token::DocComment
}
_ => tok,
}) {
Some(tok) => { Some(tok) => {
if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 { if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 {
Some(Token::Ident) Some(Token::Ident(&self.source[start..self.offset]))
} else { } else {
Some(tok) Some(tok)
} }
@ -482,7 +508,7 @@ impl<'a> TokenIterator<'a> {
{ {
self.skip(1); self.skip(1);
self.skip_while(|c| is_things::is_id_continue(c)); self.skip_while(|c| is_things::is_id_continue(c));
Some(Token::Ident) Some(Token::Ident(&self.source[start..self.offset]))
} else { } else {
None None
} }
@ -495,10 +521,8 @@ impl<'a> TokenIterator<'a> {
fn next_token_item(&mut self) -> Option<TokenItem<'a>> { fn next_token_item(&mut self) -> Option<TokenItem<'a>> {
let (token, range) = self.next_token()?; let (token, range) = self.next_token()?;
let lexeme = &self.source[range.clone()];
Some(TokenItem { Some(TokenItem {
token, token,
lexeme,
offset: range.start as u32, offset: range.start as u32,
}) })
} }
@ -509,7 +533,7 @@ impl<'a> TokenIterator<'a> {
} }
impl<'a> Iterator for TokenIterator<'a> { impl<'a> Iterator for TokenIterator<'a> {
type Item = Token; type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
self.next_token().map(|(token, _)| token) self.next_token().map(|(token, _)| token)
@ -528,6 +552,165 @@ impl<'a> Iterator for TokenItemIterator<'a> {
} }
} }
/// A parser building block that attempts to consume a prefix of a token
/// stream, producing a `Product` on success.
pub trait TokenConsumer<'a> {
    /// Value produced on a successful match.
    type Product;
    /// Error produced when the tokens do not match.
    type Error;

    /// Tries to consume tokens from `iter`. The iterator is `Clone` so
    /// implementations can look ahead and only advance `iter` on success.
    fn try_consume_tokens<I: Iterator<Item = TokenItem<'a>> + Clone>(
        &mut self,
        iter: &mut I,
    ) -> Result<Self::Product, Self::Error>;
}
/// Matches one fixed token sequence and, on success, yields `T::default()`.
struct SimpleTokenConsumer<S, T: Default = ()>(S, PhantomData<T>);

impl<'a, S, T> TokenConsumer<'a> for SimpleTokenConsumer<S, T>
where
    S: TokenSequence,
    T: Default,
{
    type Product = T;
    type Error = ();

    fn try_consume_tokens<I: Iterator<Item = TokenItem<'a>> + Clone>(
        &mut self,
        iter: &mut I,
    ) -> Result<Self::Product, Self::Error> {
        // Match against a clone so `iter` is only advanced on success.
        //
        // NB: the previous `iter2.zip(tokens).all(..)` formulation had two
        // bugs: a truncated stream matched vacuously (zip stops at the
        // shorter side), and zip pulled one extra item from the stream after
        // the last expected token, silently skipping it on success.
        let mut lookahead = iter.clone();
        for &expected in self.0.tokens() {
            match lookahead.next() {
                Some(item) if item.token == expected => {}
                _ => return Err(()),
            }
        }
        *iter = lookahead;
        Ok(T::default())
    }
}
/// Tries each token sequence in `list` in order, consuming the first one
/// that matches and returning the matched token items.
struct TokenSequenceListConsumer<L: TokenSequenceList> {
    list: L,
}

impl<'a, L: TokenSequenceList> TokenConsumer<'a> for TokenSequenceListConsumer<L> {
    type Product = Vec<TokenItem<'a>>;
    type Error = ();

    fn try_consume_tokens<I: Iterator<Item = TokenItem<'a>> + Clone>(
        &mut self,
        iter: &mut I,
    ) -> Result<Self::Product, Self::Error> {
        'sequences: for seq in self.list.iter_sequences() {
            // Record yielded items on a clone; `iter` only advances on a match.
            let mut lookahead = StealingIterator {
                iter: iter.clone(),
                yielded: Vec::new(),
            };
            // NB: the previous `zip(..).all(..)` formulation treated a
            // truncated stream as a match, and consumed (and pushed into
            // `yielded`) one token past the end of the sequence.
            for &expected in seq {
                match lookahead.next() {
                    Some(item) if item.token == expected => {}
                    _ => continue 'sequences,
                }
            }
            core::mem::swap(iter, &mut lookahead.iter);
            return Ok(lookahead.yielded);
        }
        Err(())
    }
}
/// Iterator adapter that records a clone of every yielded item in `yielded`,
/// so a lookahead pass can hand back exactly what it consumed.
struct StealingIterator<T, I: Iterator<Item = T>> {
    pub iter: I,
    pub yielded: Vec<T>,
}

impl<I, T> Iterator for StealingIterator<T, I>
where
    T: Clone,
    I: Iterator<Item = T>,
{
    type Item = T;

    fn next(&mut self) -> Option<Self::Item> {
        // Keep a copy of the item before handing it out.
        let item = self.iter.next()?;
        self.yielded.push(item.clone());
        Some(item)
    }
}
/// Anything that can be viewed as a fixed slice of expected tokens.
pub trait TokenSequence {
    /// The expected tokens, in match order.
    fn tokens(&'_ self) -> &'_ [Token<'_>];
}
/// A single token is a sequence of length one.
impl TokenSequence for Token<'_> {
    fn tokens(&'_ self) -> &'_ [Token<'_>] {
        std::slice::from_ref(self)
    }
}

/// A slice of tokens is itself a sequence.
impl TokenSequence for [Token<'_>] {
    fn tokens(&'_ self) -> &'_ [Token<'_>] {
        self
    }
}

/// A borrowed slice forwards to the slice impl.
impl TokenSequence for &[Token<'_>] {
    fn tokens(&'_ self) -> &'_ [Token<'_>] {
        self
    }
}

/// Fixed-size arrays of any length coerce to a slice.
impl<const N: usize> TokenSequence for [Token<'_>; N] {
    fn tokens(&'_ self) -> &'_ [Token<'_>] {
        self
    }
}
/// A collection of token sequences (single sequences, or tuples of lists)
/// that can be visited, iterated, or searched in order.
pub trait TokenSequenceList {
    /// Calls `f` once for every sequence in the list.
    fn for_each(&mut self, f: impl FnMut(&dyn TokenSequence));
    /// Iterates the sequences as token slices, in list order.
    fn iter_sequences(&'_ self) -> impl Iterator<Item = &[Token<'_>]>;
    /// Returns the first non-`None` result of `pred` over the sequences.
    fn first<T>(&mut self, pred: impl FnMut(&dyn TokenSequence) -> Option<T>) -> Option<T>;
}
/// Every single sequence is also a one-element list of sequences.
impl<T: TokenSequence> TokenSequenceList for T {
    fn for_each(&mut self, mut f: impl FnMut(&dyn TokenSequence)) {
        f(self);
    }

    fn iter_sequences(&'_ self) -> impl Iterator<Item = &[Token<'_>]> {
        std::iter::once(self.tokens())
    }

    fn first<U>(&mut self, mut pred: impl FnMut(&dyn TokenSequence) -> Option<U>) -> Option<U> {
        pred(self)
    }
}
/// Implements [`TokenSequenceList`] for tuples whose elements are themselves
/// lists, visiting/chaining/searching each element in tuple order.
/// `$is` is the tuple index, `$ts` the matching type parameter.
macro_rules! impl_token_sequence_list {
    ($(($is:tt, $ts:ident)),*) => {
        impl<$($ts,)*> $crate::TokenSequenceList for ($($ts,)*) where
            $($ts: $crate::TokenSequenceList,)* {
            fn for_each(&mut self, mut f: impl FnMut(&dyn $crate::TokenSequence)) {
                $(self.$is.for_each(&mut f);)*
            }

            fn iter_sequences(&'_ self) -> impl Iterator<Item = &[Token<'_>]> {
                // Chain the sequences of every tuple element, in order.
                std::iter::empty()
                    $(.chain(self.$is.iter_sequences()))*
            }

            fn first<U>(&mut self, mut pred: impl FnMut(&dyn $crate::TokenSequence) -> Option<U>) -> Option<U> {
                // Short-circuit on the first element that yields Some.
                $(
                    if let Some(res) = self.$is.first(&mut pred) {
                        return Some(res);
                    }
                )*
                None
            }
        }
    };
}

// Generate the impl for tuples of arity 1 through 15.
variadics_please::all_tuples_enumerated!(impl_token_sequence_list, 1, 15, T);
mod complex_tokens; mod complex_tokens;
#[cfg(test)] #[cfg(test)]
@ -552,7 +735,7 @@ mod tests {
#[test] #[test]
fn idents() { fn idents() {
let mut lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123"); let mut lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123");
assert!(lexer.all(|tok| tok == Token::Ident)); assert!(lexer.all(|tok| matches!(tok, Token::Ident(_))));
} }
#[test] #[test]
@ -562,43 +745,61 @@ mod tests {
assert_eq!( assert_eq!(
tokens, tokens,
vec![ vec![
Token::Ident, Token::Ident("a-a"),
Token::Ident, Token::Ident("a-"),
Token::Minus, Token::Minus,
Token::Ident, Token::Ident("a"),
Token::Ident, Token::Ident("-a"),
Token::Ident Token::Ident("--a")
] ]
); );
} }
#[test]
fn comments() {
    // Two adjacent `//` lines lex as ONE Comment token (continuation lines
    // are absorbed), while the following `///` line lexes as a separate
    // DocComment token. The raw string's exact whitespace is significant:
    // the lexemes below must match it byte-for-byte.
    let mut lexer = TokenIterator::new(
        r#"
// this is a comment
// spanning two lines
/// this is a doc comment"#,
    );
    assert_eq!(
        lexer.next(),
        Some(Token::Comment(
            "// this is a comment\n// spanning two lines\n"
        ))
    );
    assert_eq!(
        lexer.next(),
        Some(Token::DocComment("/// this is a doc comment"))
    );
}
#[test] #[test]
fn complex_iterator() { fn complex_iterator() {
let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }"; let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }";
let lexer = TokenIterator::new(&tokens); let lexer = TokenIterator::new(&tokens);
let mut items = lexer let mut items = lexer.into_token_items().map(|item| item.token);
.into_token_items() assert_eq!(items.next(), Some(Token::Fn));
.map(|item| (item.token, item.lexeme)); assert_eq!(items.next(), Some(Token::Ident("my-function")));
assert_eq!(items.next(), Some((Token::Fn, "fn"))); assert_eq!(items.next(), Some(Token::OpenParens));
assert_eq!(items.next(), Some((Token::Ident, "my-function"))); assert_eq!(items.next(), Some(Token::Ident("x")));
assert_eq!(items.next(), Some((Token::OpenParens, "("))); assert_eq!(items.next(), Some(Token::Colon));
assert_eq!(items.next(), Some((Token::Ident, "x"))); assert_eq!(items.next(), Some(Token::I32));
assert_eq!(items.next(), Some((Token::Colon, ":"))); assert_eq!(items.next(), Some(Token::Comma));
assert_eq!(items.next(), Some((Token::I32, "i32"))); assert_eq!(items.next(), Some(Token::Ident("y")));
assert_eq!(items.next(), Some((Token::Comma, ","))); assert_eq!(items.next(), Some(Token::Colon));
assert_eq!(items.next(), Some((Token::Ident, "y"))); assert_eq!(items.next(), Some(Token::F32));
assert_eq!(items.next(), Some((Token::Colon, ":"))); assert_eq!(items.next(), Some(Token::CloseParens));
assert_eq!(items.next(), Some((Token::F32, "f32"))); assert_eq!(items.next(), Some(Token::MinusGreater));
assert_eq!(items.next(), Some((Token::CloseParens, ")"))); assert_eq!(items.next(), Some(Token::F32));
assert_eq!(items.next(), Some((Token::MinusGreater, "->"))); assert_eq!(items.next(), Some(Token::OpenBrace));
assert_eq!(items.next(), Some((Token::F32, "f32"))); assert_eq!(items.next(), Some(Token::Return));
assert_eq!(items.next(), Some((Token::OpenBrace, "{"))); assert_eq!(items.next(), Some(Token::Ident("x")));
assert_eq!(items.next(), Some((Token::Return, "return"))); assert_eq!(items.next(), Some(Token::Plus));
assert_eq!(items.next(), Some((Token::Ident, "x"))); assert_eq!(items.next(), Some(Token::Ident("y")));
assert_eq!(items.next(), Some((Token::Plus, "+"))); assert_eq!(items.next(), Some(Token::Semi));
assert_eq!(items.next(), Some((Token::Ident, "y"))); assert_eq!(items.next(), Some(Token::CloseBrace));
assert_eq!(items.next(), Some((Token::Semi, ";")));
assert_eq!(items.next(), Some((Token::CloseBrace, "}")));
assert_eq!(items.next(), None); assert_eq!(items.next(), None);
} }
} }

17
crates/parser/Cargo.toml Normal file
View file

@ -0,0 +1,17 @@
[package]
name = "parser"
version = "0.1.0"
edition = "2024"
[dependencies]
tracing = { workspace = true }
werkzeug = { workspace = true }
thiserror = { workspace = true }
itertools = { workspace = true }
internment = "0.8.6"
lexer = { path = "../lexer", version = "0.1.0" }
logos = "0.15"
pomelo = "0.2"

582
crates/parser/src/lib.rs Normal file
View file

@ -0,0 +1,582 @@
use internment::Intern;
use lexer::{Token, TokenConsumer, TokenItem, TokenItemIterator};
use logos::Logos;
use pomelo::pomelo;
use thiserror::Error;
/// Width of an integer type: a fixed bit count or pointer-sized.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum IntSize {
    Bits(u16),
    Pointer,
}

/// Structural description of a type; values are interned as [`Type`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum InnerType {
    /// Supertype of everything.
    Top,
    /// Subtype of everything (never type).
    Bottom,
    Unit,
    Bool,
    Int {
        signed: bool,
        size: IntSize,
    },
    Float {
        float_type: FloatType,
    },
    Pointer {
        pointee: Box<Type>,
    },
    Array {
        element: Box<Type>,
        size: usize,
    },
    Function {
        return_type: Box<Type>,
        parameter_types: Vec<Type>,
    },
    Tuple {
        elements: Vec<Type>,
    },
    TypeUnion {
        types: Vec<Type>,
    },
    TypeIntersection {
        types: Vec<Type>,
    },
}

/// Interned handle to an [`InnerType`] (see `internment::Intern`).
type Type = internment::Intern<InnerType>;

/// Supported floating-point widths.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FloatType {
    F32,
    F64,
}

/// A literal constant value carried by a `Constant` AST node.
#[derive(Debug, Clone)]
pub enum Value {
    Bool(bool),
    Int(i64),
    UInt(u64),
    Float(f64),
    String(String),
}

/// Which control-flow statement a `ControlFlow` AST node represents.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ControlFlowKind {
    Return,
    Break,
    Continue,
}

/// Handle to an [`AstNode`] stored in [`Ast`]; wraps a position in the
/// node vector.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Index(u32);
/// A node in the flat AST; nodes reference their children via [`Index`]
/// handles into the [`Ast`] arena rather than owning them directly.
#[derive(Debug)]
pub enum AstNode {
    // --- structure ---
    /// Top of the tree: one entry per parsed file.
    Root {
        files: Vec<Index>,
    },
    /// A single source file and its top-level declarations.
    File {
        decls: Vec<Index>,
    },
    // --- functions ---
    /// Function signature without a body.
    FunctionProto {
        name: String,
        return_type: Type,
        parameter_list: Index,
    },
    ParameterList {
        parameters: Vec<Index>,
    },
    Parameter {
        name: String,
        param_type: Type,
    },
    /// Full function declaration (signature plus body).
    FunctionDecl(FunctionDecl),
    /// Brace block: statements plus an optional trailing expression.
    Block {
        statements: Vec<Index>,
        expr: Option<Index>,
    },
    /// Literal constant with its resolved type.
    Constant {
        ty: Type,
        value: Value,
    },
    /// Placeholder expression that does nothing.
    NoopExpr,
    /// Expression used as a statement.
    Stmt {
        expr: Index,
    },
    /// `return` / `break` / `continue`, optionally with a value.
    ControlFlow {
        kind: ControlFlowKind,
        expr: Option<Index>,
    },
    // --- declarations ---
    VarDecl {
        mutable: bool,
        name: String,
        var_type: Type,
    },
    Assignment {
        dest: Index,
        expr: Index,
    },
    GlobalDecl {
        name: String,
        var_type: Type,
        value: Index,
    },
    StructDecl {
        name: String,
        fields: Vec<Index>,
    },
    FieldDecl {
        name: String,
        field_type: Type,
    },
    FieldAccess {
        expr: Index,
        field: String,
    },
    // --- references ---
    /// Name reference not yet resolved to a declaration.
    UnresolvedDeclRef {
        name: String,
    },
    /// Resolved reference to a declaration node.
    DeclRef {
        decl: Index,
    },
    TypeDeclRef {
        ty: Index,
    },
    // --- conversions and places ---
    ExplicitCast {
        expr: Index,
        ty: Type,
    },
    Deref {
        expr: Index,
    },
    AddressOf {
        expr: Index,
    },
    PlaceToValue {
        expr: Index,
    },
    ValueToPlace {
        expr: Index,
    },
    // --- calls ---
    CallExpr {
        callee: Index,
        arguments: Vec<Index>,
    },
    Argument {
        expr: Index,
    },
    // --- unary operators ---
    Not(Index),
    Negate(Index),
    // --- binary operators ---
    Multiply {
        left: Index,
        right: Index,
    },
    Divide {
        left: Index,
        right: Index,
    },
    Modulus {
        left: Index,
        right: Index,
    },
    Add {
        left: Index,
        right: Index,
    },
    Subtract {
        left: Index,
        right: Index,
    },
    BitOr {
        left: Index,
        right: Index,
    },
    BitAnd {
        left: Index,
        right: Index,
    },
    BitXor {
        left: Index,
        right: Index,
    },
    LogicalOr {
        left: Index,
        right: Index,
    },
    LogicalAnd {
        left: Index,
        right: Index,
    },
    Eq {
        left: Index,
        right: Index,
    },
    NotEq {
        left: Index,
        right: Index,
    },
    Less {
        left: Index,
        right: Index,
    },
    LessEq {
        left: Index,
        right: Index,
    },
    Greater {
        left: Index,
        right: Index,
    },
    GreaterEq {
        left: Index,
        right: Index,
    },
    ShiftLeft {
        left: Index,
        right: Index,
    },
    ShiftRight {
        left: Index,
        right: Index,
    },
    Subscript {
        expr: Index,
        index: Index,
    },
    // --- control flow expressions ---
    If {
        condition: Index,
        then: Index,
        r#else: Option<Index>,
    },
    Else {
        expr: Index,
    },
    // --- trivia and attributes ---
    Comment {
        text: String,
    },
    Attributes {
        attrs: Vec<Index>,
    },
    Doc {
        text: String,
    },
    /// Error placeholder node carrying the underlying error.
    Error {
        err: Box<dyn core::error::Error>,
    },
}
/// Item visibility; defaults to `Private` unless `pub` was written.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum Visibility {
    #[default]
    Private,
    Public,
}
/// Errors produced while parsing a token stream.
#[derive(Debug, Error)]
pub enum ParseError<'a> {
    /// Ran out of tokens before a complete construct was parsed.
    #[error("End of file.")]
    EOF,
    /// A token that is not valid at the current position.
    #[error("Unexpected token: {0:?}")]
    UnexpectedToken(Token<'a>),
    /// Something that is not a type appeared where a type was expected.
    #[error("Not a type.")]
    NotAType,
}
/// Flat AST arena: nodes are appended and addressed by [`Index`].
#[derive(Default, Debug)]
pub struct Ast {
    nodes: Vec<AstNode>,
}

impl Ast {
    /// Creates an empty AST.
    pub fn new() -> Self {
        Self::default()
    }

    /// Appends `node` and returns its handle.
    ///
    /// # Panics
    /// Panics if the node count no longer fits in a `u32`. (The previous
    /// `as u32` cast would silently truncate past 2^32 nodes, aliasing
    /// earlier indices.)
    pub fn push(&mut self, node: AstNode) -> Index {
        let index = u32::try_from(self.nodes.len()).expect("AST node count exceeds u32::MAX");
        self.nodes.push(node);
        Index(index)
    }
}
/// Parsed function declaration, built up before being pushed into the AST.
#[derive(Debug)]
struct FunctionDecl {
    // attrs: index of an `Attributes` node, if any were parsed.
    attrs: Option<Index>,
    name: String,
    visibility: Visibility,
    return_type: Type,
    parameter_list: Option<ParameterList>,
    // body: index of the function's block node.
    body: Index,
}

/// A single function parameter as parsed.
#[derive(Debug)]
struct Parameter {
    mutable: bool,
    name: String,
    param_type: Type,
}

/// Indices of a function's parameter nodes.
#[derive(Debug)]
struct ParameterList {
    parameters: Vec<Index>,
}

/// A lexeme and its byte offset in the source.
// NOTE(review): appears intended to carry per-token source info alongside
// the pomelo token enum — confirm intended use once the grammar is complete.
#[derive(Debug)]
struct ExtraToken<'a> {
    lexeme: &'a str,
    offset: u32,
}
// LALR grammar, expanded by the `pomelo` proc-macro into a `parser` module
// containing `Parser` and the generated `Token` enum. The `Ast` arena is
// threaded through every rule action as `extra`; actions push nodes into it
// and propagate `Index` values up the parse stack.
//
// Only `//` comments are used inside this macro: a `///` doc comment would
// be passed to the macro as a `#[doc]` attribute token and break the grammar.
pomelo! {
    %include {
        use super::AstNode;
        use super::{
            Parameter, Ast, ParameterList, FunctionDecl, Type, InnerType,
            FloatType, ExtraToken, Index, IntSize, Visibility,
        };
    };
    // The parser owns the arena and hands it back from `end_of_input()`.
    %extra_argument Ast;
    %parser pub struct Parser<'a>{};
    %token #[derive(Debug)] pub enum Token<'a> {};
    // Terminals that carry the matched source slice.
    %type Ident &'a str;
    %type DocComment &'a str;
    %type Comment &'a str;
    // Value types of the nonterminals; most rules yield an arena `Index`.
    %type fn_decl FunctionDecl;
    %type parameter Parameter;
    %type parameter_list ParameterList;
    %type typ Type;
    %type return_type Type;
    %type block Index;
    %type decl Index;
    %type decl_list Vec<Index>;
    %type file Index;
    // Start symbol: a file is an optional list of declarations.
    file ::= decl_list?(list) {
        let decls = list.unwrap_or_default();
        extra.push(AstNode::File { decls })
    };
    decl_list ::= decl(decl) { vec![decl] };
    decl_list ::= decl_list(dl) decl(decl) {
        let mut list = dl;
        list.push(decl);
        list
    };
    %type attrs Index;
    // Currently the only attribute form is a doc comment, wrapped in an
    // Attributes node so more kinds can be appended later.
    attrs ::= DocComment(text) {
        let idx = extra.push(AstNode::Doc { text: text.to_string() });
        extra.push(AstNode::Attributes { attrs: vec![idx] })
    };
    // Primitive types are interned so structurally equal types compare cheaply.
    typ ::= Bool { internment::Intern::new(InnerType::Bool) };
    typ ::= I1 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(1) }) };
    typ ::= I8 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(8) }) };
    typ ::= I16 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(16) }) };
    typ ::= I32 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(32) }) };
    typ ::= I64 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(64) }) };
    typ ::= U1 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(1) }) };
    typ ::= U8 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(8) }) };
    typ ::= U16 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(16) }) };
    typ ::= U32 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(32) }) };
    typ ::= U64 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(64) }) };
    typ ::= ISize { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Pointer }) };
    typ ::= USize { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Pointer }) };
    typ ::= F32 { internment::Intern::new(InnerType::Float { float_type: FloatType::F32 }) };
    typ ::= F64 { internment::Intern::new(InnerType::Float { float_type: FloatType::F64 }) };
    // `!` is the bottom (never) type; `()` and `void` both map to unit.
    typ ::= Bang { internment::Intern::new(InnerType::Bottom) };
    typ ::= unit { internment::Intern::new(InnerType::Unit) };
    typ ::= Void { internment::Intern::new(InnerType::Unit) };
    unit ::= LParen RParen;
    %type expr Index;
    %type stmt Index;
    %type stmts Vec<Index>;
    // Expressions are not implemented yet: every expression position parses
    // to an empty production yielding a NoopExpr placeholder node.
    expr ::= { extra.push(AstNode::NoopExpr)};
    stmt ::= expr(expr) Semi { extra.push(AstNode::Stmt { expr }) };
    stmts ::= stmt(s) { vec![s] };
    stmts ::= stmts(ss) stmt(s) {
        let mut v = ss;
        v.push(s);
        v
    };
    // Blocks never end in a trailing expression yet, hence `expr: None`.
    block ::= LBrace stmts?(ss) RBrace {
        extra.push(AstNode::Block {
            statements: ss.unwrap_or_default(),
            expr: None })
    };
    %type vis Visibility;
    vis ::= Pub { Visibility::Public };
    %type mutable bool;
    // Optional `mut` modifier on parameters; absence means immutable.
    mutable ::= Mutable { true };
    mutable ::= { false };
    return_type ::= Arrow typ(return_type) { return_type };
    parameter ::= mutable(mutable) Ident(name) Colon typ(param_type) {
        Parameter { mutable, name: name.to_string(), param_type }
    };
    // NOTE(review): `p.mutable` is discarded when interning the parameter —
    // confirm whether AstNode::Parameter is meant to record mutability.
    parameter_list ::= parameter(p) {
        let idx = extra.push(AstNode::Parameter { name: p.name, param_type: p.param_type });
        ParameterList { parameters: vec![idx] }
    };
    parameter_list ::= parameter_list(pl) Comma parameter(p) {
        let idx = extra.push(AstNode::Parameter { name: p.name, param_type: p.param_type });
        let mut parameters = pl.parameters;
        parameters.push(idx);
        ParameterList { parameters }
    };
    // Permits a trailing comma after the last parameter.
    parameter_list ::= parameter_list(pl) Comma {
        pl
    };
    // Plain comments are kept in the AST as their own declaration nodes.
    decl ::= Comment(text) { extra.push(AstNode::Comment { text: text.to_string() }) };
    decl ::= fn_decl(f) { extra.push(AstNode::FunctionDecl(f)) };
    fn_decl ::= attrs?(attrs) vis?(visibility) Fn Ident(name) LParen parameter_list?(parameters) RParen return_type(rtype) block(body) {
        let name = name.to_string();
        FunctionDecl {
            attrs,
            name,
            visibility: visibility.unwrap_or_default(),
            return_type: rtype,
            parameter_list: parameters,
            body,
        }
    };
}
impl<'a> From<lexer::Token<'a>> for parser::Token<'a> {
    /// Maps the lexer's token enum onto the token enum generated by `pomelo!`.
    ///
    /// # Panics
    /// Most lexemes are not yet part of the grammar and hit `todo!()`, so
    /// feeding anything beyond the currently-supported subset (fn items,
    /// primitive types, comments, punctuation) aborts. The `todo!()` arms
    /// double as a checklist of grammar work remaining.
    fn from(value: lexer::Token<'a>) -> Self {
        use lexer::Token;
        match value {
            // Punctuation and keywords the grammar already understands.
            Token::Fn => Self::Fn,
            Token::OpenParens => Self::LParen,
            Token::CloseParens => Self::RParen,
            Token::OpenBrace => Self::LBrace,
            Token::CloseBrace => Self::RBrace,
            Token::Ident(ident) => Self::Ident(ident),
            Token::Comment(text) => Self::Comment(text),
            Token::DocComment(text) => Self::DocComment(text),
            Token::OpenSquareBracket => todo!(), // Self::LBracket,
            Token::CloseSquareBracket => todo!(), // Self::RBracket,
            Token::Comma => Self::Comma,
            Token::Colon => Self::Colon,
            Token::Semi => Self::Semi,
            Token::Elipsis3 => todo!(),
            Token::Elipsis2 => todo!(),
            Token::Equal => todo!(),
            // Primitive type keywords, mapped one-to-one.
            Token::Void => Self::Void,
            Token::Bool => Self::Bool,
            Token::F32 => Self::F32,
            Token::F64 => Self::F64,
            Token::ISize => Self::ISize,
            Token::USize => Self::USize,
            Token::U1 => Self::U1,
            Token::U8 => Self::U8,
            Token::U16 => Self::U16,
            Token::U32 => Self::U32,
            Token::U64 => Self::U64,
            Token::I1 => Self::I1,
            Token::I8 => Self::I8,
            Token::I16 => Self::I16,
            Token::I32 => Self::I32,
            Token::I64 => Self::I64,
            // Declaration/statement keywords not yet in the grammar.
            Token::Const => todo!(), // Self::Const,
            Token::Mutable => Self::Mutable,
            Token::Volatile => todo!(),
            Token::Noalias => todo!(),
            Token::Let => todo!(),
            Token::Var => todo!(),
            Token::If => todo!(),
            Token::As => todo!(),
            Token::Else => todo!(),
            Token::Return => todo!(),
            Token::Struct => todo!(),
            Token::Type => todo!(),
            Token::Union => todo!(),
            Token::Enum => todo!(),
            Token::Packed => todo!(),
            Token::Extern => todo!(),
            Token::Pub => Self::Pub,
            Token::Module => todo!(),
            Token::Dot => todo!(),
            Token::MinusGreater => Self::Arrow,
            Token::Bang => Self::Bang,
            // Operators — none are parsed yet because expressions are stubbed.
            Token::Tilde => todo!(),
            Token::Plus => todo!(),
            Token::Minus => todo!(),
            Token::Star => todo!(),
            Token::Slash => todo!(),
            Token::Percent => todo!(),
            Token::Less => todo!(),
            Token::Greater => todo!(),
            Token::LessEqual => todo!(),
            Token::GreaterEqual => todo!(),
            Token::EqualEqual => todo!(),
            Token::BangEqual => todo!(),
            Token::PipePipe => todo!(),
            Token::AmpersandAmpersand => todo!(),
            Token::Ampersand => todo!(),
            Token::Caret => todo!(),
            Token::Pipe => todo!(),
            Token::LessLess => todo!(),
            Token::GreaterGreater => todo!(),
            Token::Question => todo!(),
            Token::PlusEqual => todo!(),
            Token::MinusEqual => todo!(),
            Token::StarEqual => todo!(),
            Token::SlashEqual => todo!(),
            Token::PercentEqual => todo!(),
            Token::AmpersandEqual => todo!(),
            Token::PipeEqual => todo!(),
            Token::CaretEqual => todo!(),
            Token::LessLessEqual => todo!(),
            Token::GreaterGreaterEqual => todo!(),
            // NOTE(review): Eof panics too — callers must stop pulling tokens
            // before the lexer yields Eof and call `end_of_input()` instead.
            Token::Eof(_) => todo!(),
            Token::ParseError(_) => todo!(),
            // Literals — not yet representable in the generated token enum.
            Token::CharConstant(_) => todo!(),
            Token::IntegerConstant(_) => todo!(),
            Token::IntegerHexConstant(_) => todo!(),
            Token::IntegerBinConstant(_) => todo!(),
            Token::IntegerOctConstant(_) => todo!(),
            Token::FloatingConstant(_) => todo!(),
            Token::FloatingExpConstant(_) => todo!(),
            Token::DotFloatingConstant(_) => todo!(),
            Token::DotFloatingExpConstant(_) => todo!(),
            Token::StringConstant(_) => todo!(),
        }
    }
}
#[cfg(test)]
mod tests {
    use crate::AstNode;

    /// Prints the size of `AstNode` so layout regressions are visible in
    /// test output (an enum is as large as its biggest variant).
    #[test]
    fn print_ast_node_size() {
        eprintln!("Size of AstNode: {}", std::mem::size_of::<AstNode>());
    }

    /// Lexes a small program and drives every token through the generated
    /// parser; each `parse` step and the final reduction must succeed.
    ///
    /// Cleanups over the previous version: the needless `mut` bindings on
    /// the lexer, the mapped iterator, and the arena are gone (`inspect`/
    /// `map` take the iterator by value and `Parser::new` takes the arena
    /// by value), the `while let ... next()` loop is a `for` loop, and the
    /// unused first element of `end_of_input()`'s tuple is discarded.
    #[test]
    fn parse() {
        use crate::parser::{Parser, Token};

        let input = r#"
        // A simple test case
        /// A function that takes two u32 parameters and returns a u32
        fn main(a: u32, b: u32) -> u32 {}
        "#;
        let tokens = lexer::TokenIterator::new(input)
            .inspect(|t| eprintln!("{t:?}"))
            .map(Token::from);

        let mut parser = Parser::new(crate::Ast::new());
        for token in tokens {
            parser.parse(token).unwrap();
        }
        let (_, ast) = parser.end_of_input().unwrap();
        eprintln!("AST: {:#?}", ast);
    }
}