diff --git a/crates/lexer/src/complex_tokens.rs b/crates/lexer/src/complex_tokens.rs index 802f4ae..52ec7c2 100644 --- a/crates/lexer/src/complex_tokens.rs +++ b/crates/lexer/src/complex_tokens.rs @@ -63,23 +63,12 @@ impl Radix { Radix::Dec => 10, } } - fn to_token(self) -> Token { + fn to_constant_kind(self) -> ConstantKind { match self { - Radix::Hex => Token::IntegerHexConstant, - Radix::Bin => Token::IntegerBinConstant, - Radix::Oct => Token::IntegerOctConstant, - Radix::Dec => Token::IntegerConstant, - } - } - - #[expect(dead_code)] - pub fn from_token(token: Token) -> Option { - match token { - Token::IntegerHexConstant => Some(Radix::Hex), - Token::IntegerBinConstant => Some(Radix::Bin), - Token::IntegerOctConstant => Some(Radix::Oct), - Token::IntegerConstant => Some(Radix::Dec), - _ => None, + Radix::Hex => ConstantKind::HexInteger, + Radix::Bin => ConstantKind::BinInteger, + Radix::Oct => ConstantKind::OctInteger, + Radix::Dec => ConstantKind::Integer, } } @@ -236,7 +225,8 @@ fn try_parse_exp_part(source: &mut Source) -> Result> { // DEC_DIGITS FloatingType? // `.` DEC_DIGITS EXP_PART? FloatingType? // DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType? -fn parse_constant_inner(source: &mut Source) -> Result { +fn parse_constant_inner(source: &mut Source) -> Result { + let start = source.count; let zero = source.next_if(|&c| c == '0').is_some(); let radix = zero @@ -248,7 +238,7 @@ fn parse_constant_inner(source: &mut Source) -> Result { if source.peek().map(|&c| c == 'u' || c == 'i') == Some(true) { try_parse_integral_type(source)?; } - return Ok(radix.to_token()); + return Ok(radix.to_constant_kind()); } // if zero: `_`* DIGIT (DIGIT|`_`)* @@ -260,7 +250,7 @@ fn parse_constant_inner(source: &mut Source) -> Result { }?; if let Some(_) = source.try_map_iter_if(|source| try_parse_integral_type(source))? { - return Ok(Token::IntegerConstant); + return Ok(ConstantKind::Integer); } let dot = source.next_if(|&c| c == '.').is_some(); @@ -285,17 +275,48 @@ fn parse_constant_inner(source: &mut Source) -> Result { }; let token = match (dot, exp, floating) { - (false, false, false) => Token::IntegerConstant, - (true, false, _) => Token::DotFloatingConstant, - (true, true, _) => Token::DotFloatingExpConstant, - (false, true, _) => Token::FloatingExpConstant, - (false, _, _) => Token::FloatingConstant, + (false, false, false) => ConstantKind::Integer, + (true, false, _) => ConstantKind::DotFloating, + (true, true, _) => ConstantKind::DotFloatingExp, + (false, true, _) => ConstantKind::FloatingExp, + (false, _, _) => ConstantKind::Floating, }; Ok(token) } -pub(crate) fn parse_constant(source: &mut Source) -> Result { +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ConstantKind { + Integer, + BinInteger, + OctInteger, + HexInteger, + DotFloating, + DotFloatingExp, + FloatingExp, + Floating, + Char, + String, +} + +impl<'a> From<(ConstantKind, &'a str)> for Token<'a> { + fn from((value, lexeme): (ConstantKind, &'a str)) -> Self { + match value { + ConstantKind::Integer => Token::IntegerConstant(lexeme), + ConstantKind::BinInteger => Token::IntegerBinConstant(lexeme), + ConstantKind::OctInteger => Token::IntegerOctConstant(lexeme), + ConstantKind::HexInteger => Token::IntegerHexConstant(lexeme), + ConstantKind::DotFloating => Token::DotFloatingConstant(lexeme), + ConstantKind::DotFloatingExp => Token::DotFloatingExpConstant(lexeme), + ConstantKind::FloatingExp => Token::FloatingExpConstant(lexeme), + ConstantKind::Floating => Token::FloatingConstant(lexeme), + ConstantKind::Char => Token::CharConstant(lexeme), + ConstantKind::String => Token::StringConstant(lexeme), + } + } +} + +pub(crate) fn parse_constant(source: &mut Source) -> Result { let constant = parse_constant_inner(source)?; // char following a constant must not be id_continue if source @@ -309,7 +330,7 @@ pub(crate) fn parse_constant(source: &mut Source) -> Result { Ok(constant) } -pub(crate) fn parse_string_or_char_constant(source: &mut Source) -> Result { +pub(crate) fn parse_string_or_char_constant(source: &mut Source) -> Result { let quote = source .next_if(|&c| c == '"' || c == '\'') .ok_or(Error::InvalidToken)?; @@ -340,15 +361,64 @@ pub(crate) fn parse_string_or_char_constant(source: &mut Source) -> Result(source: &'a mut Source) -> Result { + if !(source.next() == Some('/') && source.next() == Some('/')) { + return Err(Error::InvalidToken); + } + + let doc = source.next_if_eq(&'/').is_some(); + eprintln!("doc comment: {doc}"); + loop { + // take until new line + source + .take_while_inclusive(|&c| c != '\n') + .inspect(|c| eprintln!("skipping comment char: {c}")) + .for_each(drop); + + let mut copy = source.clone(); + // skip whitespaces after new line to find continuation of comment + (&mut copy) + .take_while_ref(|&c| { + eprintln!("Skipping whitespace: {c}"); + is_things::is_whitespace(c) && c != '\n' + }) + .for_each(drop); + + if (copy.next() == Some('/')) && (copy.next() == Some('/')) { + match copy.next() { + None => break, + // docs end here, regular comment starts + Some('\n') if doc => break, + // this is a comment, so we can just take until this new line + Some('\n') if !doc => continue, + // continue doc comment + Some('/') if doc => {} + Some('/') if !doc => break, + Some(_) if doc => break, + // continue regular comment + Some(_) => {} + } + + *source = copy; + } else { + break; + } + } + Ok(doc) +} + #[cfg(test)] mod tests { + use crate::complex_tokens::parse_comment; + use super::*; fn make_source(s: &'_ str) -> Source<'_> { @@ -359,36 +429,36 @@ mod tests { fn parse_constant_number() { assert_eq!( parse_constant(&mut make_source("0x1A3F_u32")), - Ok(Token::IntegerHexConstant) + Ok(ConstantKind::HexInteger) ); assert_eq!( parse_constant(&mut make_source("13f32")), - Ok(Token::FloatingConstant) + Ok(ConstantKind::Floating) ); assert_eq!( parse_constant(&mut make_source("0b1011_0010i16")), - Ok(Token::IntegerBinConstant) + Ok(ConstantKind::BinInteger) ); assert_eq!( parse_constant(&mut make_source("0o755u8")), - Ok(Token::IntegerOctConstant) + Ok(ConstantKind::OctInteger) ); assert_eq!( parse_constant(&mut make_source("42i64")), - Ok(Token::IntegerConstant) + Ok(ConstantKind::Integer) ); assert_eq!( parse_constant(&mut make_source("3.14f64")), - Ok(Token::DotFloatingConstant) + Ok(ConstantKind::DotFloating) ); assert_eq!( parse_constant(&mut make_source("2.71828e0f32")), - Ok(Token::DotFloatingExpConstant) + Ok(ConstantKind::DotFloatingExp) ); assert_eq!( parse_constant(&mut make_source("22e23")), - Ok(Token::FloatingExpConstant) + Ok(ConstantKind::FloatingExp) ); } } diff --git a/crates/lexer/src/lib.rs b/crates/lexer/src/lib.rs index dfcbb9b..2b996ea 100644 --- a/crates/lexer/src/lib.rs +++ b/crates/lexer/src/lib.rs @@ -91,47 +91,47 @@ macro_rules! tokens { #[allow(dead_code)] #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)] - $vis enum $ty_name { + $vis enum $ty_name<'a> { $($name, )* - $($name2,)* + $($name2(&'a str),)* } - impl ::core::fmt::Display for $ty_name { + impl ::core::fmt::Display for $ty_name<'_> { fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { match self { $(Self::$name => write!(f, "{}", $lexeme),)* - $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),* + $(Self::$name2(lexeme) => write!(f, "[{}: {lexeme}]", stringify!($name2))),* } } } #[allow(dead_code)] - impl $ty_name { - $vis fn lexeme(&self) -> Option<&'static str> { + impl $ty_name<'_> { + $vis fn lexeme(&'_ self) -> &'_ str { match self { - $(Self::$name => Some($lexeme),)* - $(Self::$name2 => None),* + $(Self::$name => $lexeme,)* + $(Self::$name2(lexeme) => lexeme),* } } /// returns the number of chars in this lexeme $vis fn lexeme_len(&self) -> usize { - self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0) + self.lexeme().chars().count() } /// returns the number of chars in this lexeme $vis fn lexeme_len_utf8(&self) -> usize { - self.lexeme().map(|lexeme|lexeme.len()).unwrap_or(0) + self.lexeme().len() } $vis fn maybe_ident(&self) -> bool { - self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false) + crate::is_things::is_ident(self.lexeme()) } - $vis fn lexemes() -> &'static [(Self, &'static str)] { + $vis fn lexemes() -> &'static [(Token<'static>, &'static str)] { &[ - $((Self::$name, $lexeme)),* + $((Token::$name, $lexeme)),* ] } } @@ -159,8 +159,8 @@ tokens!(pub Token: { }, // Lexical Tokens: { - SlashSlash => "//", - SlashSlashSlash => "///", + // SlashSlash => "//", + // SlashSlashSlash => "///", // SlashStar => "/*", // SlashStarStar => "/**", //StarSlash => "*/", @@ -251,7 +251,7 @@ tokens!(pub Token: { GreaterGreaterEqual => ">>=" }); -impl Token { +impl Token<'_> { pub fn is_assignment_op(self) -> bool { match self { Token::PlusEqual @@ -299,18 +299,13 @@ impl Token { } } -use std::{ - collections::VecDeque, - marker::PhantomData, - ops::{Deref, DerefMut, Range}, -}; +use std::{marker::PhantomData, ops::Range}; use trie::Tree; #[derive(Debug, Clone, Copy)] pub struct TokenItem<'a> { - pub token: Token, - pub lexeme: &'a str, + pub token: Token<'a>, pub offset: u32, } @@ -340,24 +335,39 @@ impl CharCountingIterator { } } -impl core::ops::Deref for CharCountingIterator { - type Target = I; +impl> CharCountingIterator> { + fn peek(&mut self) -> Option<&I::Item> { + self.iter.peek() + } - fn deref(&self) -> &Self::Target { - &self.iter + fn next_if_eq(&mut self, expected: &I::Item) -> Option + where + I::Item: PartialEq, + { + self.iter + .next_if_eq(expected) + .inspect(|c| self.count += c.len_utf8()) } } -impl core::ops::DerefMut for CharCountingIterator { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.iter - } -} +// impl core::ops::Deref for CharCountingIterator { +// type Target = I; + +// fn deref(&self) -> &Self::Target { +// &self.iter +// } +// } + +// impl core::ops::DerefMut for CharCountingIterator { +// fn deref_mut(&mut self) -> &mut Self::Target { +// &mut self.iter +// } +// } type Source<'a> = CharCountingIterator>>; pub struct TokenIterator<'a> { - trie: Tree, + trie: Tree>, source: &'a str, offset: usize, } @@ -381,7 +391,7 @@ impl<'a> TokenIterator<'a> { CharCountingIterator::from(self.source[self.offset..].chars().peekable()) } - fn parse(&mut self) -> Option { + fn parse(&mut self) -> Option> { let mut iter = CharCountingIterator::from(self.source[self.offset..].chars()); match self.trie.get_closest(&mut iter) { @@ -423,7 +433,11 @@ impl<'a> TokenIterator<'a> { count } - fn next_token(&mut self) -> Option<(Token, Range)> { + fn follows(&self, s: &str) -> bool { + self.source[self.offset..].starts_with(s) + } + + fn next_token(&mut self) -> Option<(Token<'a>, Range)> { // skip whitespace self.skip_whitespaces(); @@ -436,19 +450,19 @@ impl<'a> TokenIterator<'a> { let token = complex_tokens::parse_constant(&mut source).ok()?; self.offset += source.offset(); - Some(token) + Some((token, &self.source[start..self.offset]).into()) } Some('.') if cursor.next().map_or(false, is_things::is_digit) => { let token = complex_tokens::parse_constant(&mut source).ok()?; self.offset += source.offset(); - Some(token) + Some((token, &self.source[start..self.offset]).into()) } Some('\'' | '"') => { let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?; self.offset += source.offset(); - Some(token) + Some((token, &self.source[start..self.offset]).into()) } Some('`') => { // raw identifier @@ -456,27 +470,32 @@ impl<'a> TokenIterator<'a> { self.skip_while(|c| is_things::is_id_continue(c)); if self.peekable_source().next() == Some('`') { self.skip(1); - Some(Token::Ident) + let lexeme = &self.source[start..self.offset]; + Some(Token::Ident(lexeme)) } else { // unterminated raw identifier - Some(Token::ParseError) + let lexeme = &self.source[start..self.offset]; + Some(Token::ParseError(lexeme)) } } // `//`-style comments or doc-comments - _ => match self.parse().map(|tok| match tok { - Token::SlashSlash => { - self.skip_while(|c| c == '\n'); - Token::Comment + Some('/') if self.follows("//") => { + let doc = complex_tokens::parse_comment(&mut source).ok()?; + self.offset += source.offset(); + eprintln!("next: {:?}", source.next()); + eprintln!("rest: {:?}", &self.source[self.offset..]); + + let lexeme = &self.source[start..self.offset]; + if doc { + Some(Token::DocComment(lexeme)) + } else { + Some(Token::Comment(lexeme)) } - Token::SlashSlashSlash => { - self.skip_while(|c| c == '\n'); - Token::DocComment - } - _ => tok, - }) { + } + _ => match self.parse() { Some(tok) => { if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 { - Some(Token::Ident) + Some(Token::Ident(&self.source[start..self.offset])) } else { Some(tok) } @@ -489,7 +508,7 @@ impl<'a> TokenIterator<'a> { { self.skip(1); self.skip_while(|c| is_things::is_id_continue(c)); - Some(Token::Ident) + Some(Token::Ident(&self.source[start..self.offset])) } else { None } @@ -502,10 +521,8 @@ impl<'a> TokenIterator<'a> { fn next_token_item(&mut self) -> Option> { let (token, range) = self.next_token()?; - let lexeme = &self.source[range.clone()]; Some(TokenItem { token, - lexeme, offset: range.start as u32, }) } @@ -516,7 +533,7 @@ impl<'a> TokenIterator<'a> { } impl<'a> Iterator for TokenIterator<'a> { - type Item = Token; + type Item = Token<'a>; fn next(&mut self) -> Option { self.next_token().map(|(token, _)| token) @@ -535,115 +552,6 @@ impl<'a> Iterator for TokenItemIterator<'a> { } } -pub struct Peeking; -pub struct Consuming; -pub trait ReborrowMode: sealed::Sealed {} -impl ReborrowMode for Peeking {} -impl ReborrowMode for Consuming {} - -mod sealed { - pub trait Sealed {} - impl Sealed for super::Peeking {} - impl Sealed for super::Consuming {} -} - -enum Queue<'a, T> { - Owned(VecDeque), - Borrowed(&'a mut VecDeque), -} - -impl<'a, T> Queue<'a, T> { - fn borrowed(&'_ mut self) -> Queue<'_, T> { - match self { - Queue::Owned(v) => Queue::Borrowed(v), - Queue::Borrowed(v) => Queue::Borrowed(v), - } - } -} - -impl Deref for Queue<'_, T> { - type Target = VecDeque; - - fn deref(&self) -> &Self::Target { - match self { - Queue::Owned(v) => v, - Queue::Borrowed(v) => v, - } - } -} - -impl DerefMut for Queue<'_, T> { - fn deref_mut(&mut self) -> &mut Self::Target { - match self { - Queue::Owned(v) => v, - Queue::Borrowed(v) => v, - } - } -} - -pub struct ReborrowingIterator<'a, 'b, I, T, Marker> -where - I: Iterator, -{ - iter: &'a mut I, - cache: Queue<'b, T>, - peeking_cursor: usize, - _marker: PhantomData, -} - -pub type ReborrowingPeekingIterator<'a, 'b, I, T> = ReborrowingIterator<'a, 'b, I, T, Peeking>; -pub type ReborrowingConsumingIterator<'a, 'b, I, T> = ReborrowingIterator<'a, 'b, I, T, Consuming>; - -impl<'a, 'b, I, T, Marker> ReborrowingIterator<'a, 'b, I, T, Marker> -where - I: Iterator, -{ - pub fn new(iter: &'a mut I) -> Self { - Self { - iter, - cache: Queue::Owned(VecDeque::new()), - peeking_cursor: 0, - _marker: PhantomData, - } - } - - pub fn reborrow_peeking(self) -> ReborrowingIterator<'a, 'b, I, T, Peeking> { - ReborrowingIterator { - iter: self.iter, - cache: self.cache, - peeking_cursor: 0, - _marker: PhantomData, - } - } - - pub fn reborrow_consuming(self) -> ReborrowingIterator<'a, 'b, I, T, Consuming> { - ReborrowingIterator { - iter: self.iter, - cache: self.cache, - peeking_cursor: 0, - _marker: PhantomData, - } - } - - pub fn borrow_peeking(&'_ mut self) -> ReborrowingIterator<'_, '_, I, T, Peeking> { - ReborrowingIterator { - iter: self.iter, - cache: self.cache.borrowed(), - peeking_cursor: 0, - _marker: PhantomData, - } - } - - pub fn borrow_consuming(&'_ mut self) -> ReborrowingIterator<'_, '_, I, T, Consuming> { - ReborrowingIterator { - iter: self.iter, - cache: self.cache.borrowed(), - peeking_cursor: 0, - _marker: PhantomData, - } - } -} - pub trait TokenConsumer<'a> { type Product; type Error; @@ -732,34 +640,34 @@ where } pub trait TokenSequence { - fn tokens(&'_ self) -> &'_ [Token]; + fn tokens(&'_ self) -> &'_ [Token<'_>]; } -impl TokenSequence for Token { - fn tokens(&'_ self) -> &'_ [Token] { +impl TokenSequence for Token<'_> { + fn tokens(&'_ self) -> &'_ [Token<'_>] { std::slice::from_ref(self) } } -impl TokenSequence for [Token] { - fn tokens(&'_ self) -> &'_ [Token] { +impl TokenSequence for [Token<'_>] { + fn tokens(&'_ self) -> &'_ [Token<'_>] { self } } -impl TokenSequence for &[Token] { - fn tokens(&'_ self) -> &'_ [Token] { +impl TokenSequence for &[Token<'_>] { + fn tokens(&'_ self) -> &'_ [Token<'_>] { self } } -impl TokenSequence for [Token; N] { - fn tokens(&'_ self) -> &'_ [Token] { +impl TokenSequence for [Token<'_>; N] { + fn tokens(&'_ self) -> &'_ [Token<'_>] { self } } pub trait TokenSequenceList { fn for_each(&mut self, f: impl FnMut(&dyn TokenSequence)); - fn iter_sequences(&self) -> impl Iterator; + fn iter_sequences(&'_ self) -> impl Iterator]>; fn first(&mut self, pred: impl FnMut(&dyn TokenSequence) -> Option) -> Option; } impl TokenSequenceList for T { @@ -767,7 +675,7 @@ impl TokenSequenceList for T { f(self); } - fn iter_sequences(&self) -> impl Iterator { + fn iter_sequences(&'_ self) -> impl Iterator]> { std::iter::once(self.tokens()) } @@ -785,7 +693,7 @@ macro_rules! impl_token_sequence_list { $(self.$is.for_each(&mut f);)* } - fn iter_sequences(&self) -> impl Iterator { + fn iter_sequences(&'_ self) -> impl Iterator]> { std::iter::empty() $(.chain(self.$is.iter_sequences()))* } @@ -803,151 +711,6 @@ macro_rules! impl_token_sequence_list { } variadics_please::all_tuples_enumerated!(impl_token_sequence_list, 1, 15, T); - -impl<'a, 'b, I> ReborrowingIterator<'a, 'b, I, TokenItem<'a>, Consuming> -where - I: Iterator>, -{ - pub fn expect_one_of>( - &mut self, - candidates: Ts, - ) -> Option> { - let mut candidates = candidates.into_iter(); - - let item = self.next()?; - if candidates.any(|cand| cand == item.token) { - Some(item) - } else { - None - } - } - - pub fn expect_sequence( - &mut self, - sequence: &S, - ) -> Option>> { - let ref mut peeking = self.borrow_peeking(); - - // check that the next tokens match the expected sequence - let matches = sequence - .tokens() - .into_iter() - .copied() - .zip(peeking.map(|item| item.token)) - .all(|(a, b)| a == b); - if matches { - Some(peeking.drain_peeked().collect()) - } else { - None - } - } - - pub fn expect_sequence_list(&mut self, mut list: L) { - list.first(|s| self.expect_sequence(s)); - } -} - -impl<'a, 'b, I, T> Iterator for ReborrowingIterator<'a, 'b, I, T, Consuming> -where - I: Iterator, -{ - type Item = T; - - fn next(&mut self) -> Option { - self.cache.pop_front().or_else(|| self.iter.next()) - } -} - -impl<'a, 'b, I, T> Iterator for ReborrowingIterator<'a, 'b, I, T, Peeking> -where - I: Iterator, - T: Copy, -{ - type Item = T; - - fn next(&mut self) -> Option { - self.peek_next().copied() - } -} - -impl<'a, 'b, I, T> ReborrowingIterator<'a, 'b, I, T, Peeking> -where - I: Iterator, -{ - pub fn peek_next(&mut self) -> Option<&T> { - if self.peeking_cursor >= self.cache.len() { - if let Some(item) = self.iter.next() { - self.peeking_cursor += 1; - Some(self.cache.push_back_mut(item)) - } else { - None - } - } else { - let item = self.cache.get(self.peeking_cursor)?; - self.peeking_cursor += 1; - Some(item) - } - } - - pub fn drain_peeked(&mut self) -> impl Iterator + '_ { - let drained = self.cache.drain(0..self.peeking_cursor); - self.peeking_cursor = 0; - drained - } - - pub fn skip(&mut self, n: usize) { - let cached = self.cache.len() - self.peeking_cursor; - self.peeking_cursor = self.peeking_cursor.saturating_add(n); - if n > cached { - // need to pull from the underlying iterator - let surplus = n - cached; - self.cache.extend(self.iter.take(surplus)); - self.peeking_cursor += n; - } - } - - pub fn borrow_consuming_at_cursor( - &'_ mut self, - ) -> ReborrowingIterator<'_, '_, I, T, Consuming> { - _ = self.drain_peeked(); - ReborrowingIterator { - iter: self.iter, - cache: self.cache.borrowed(), - peeking_cursor: 0, - _marker: PhantomData, - } - } - - pub fn reborrow_consuming_at_cursor(mut self) -> ReborrowingIterator<'a, 'b, I, T, Consuming> { - _ = self.drain_peeked(); - ReborrowingIterator { - iter: self.iter, - cache: self.cache, - peeking_cursor: 0, - _marker: PhantomData, - } - } -} - -impl<'a, 'b, I> ReborrowingIterator<'a, 'b, I, TokenItem<'a>, Peeking> -where - I: Iterator>, -{ - pub fn peek_one_of>( - &mut self, - candidates: Ts, - ) -> Option> { - let mut candidates = candidates.into_iter(); - - let item = self.peek_next()?; - if candidates.any(|cand| cand == item.token) { - Some(*item) - } else { - None - } - } -} - mod complex_tokens; #[cfg(test)] @@ -972,7 +735,7 @@ mod tests { #[test] fn idents() { let mut lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123"); - assert!(lexer.all(|tok| tok == Token::Ident)); + assert!(lexer.all(|tok| matches!(tok, Token::Ident(_)))); } #[test] @@ -982,43 +745,61 @@ mod tests { assert_eq!( tokens, vec![ - Token::Ident, - Token::Ident, + Token::Ident("a-a"), + Token::Ident("a-"), Token::Minus, - Token::Ident, - Token::Ident, - Token::Ident + Token::Ident("a"), + Token::Ident("-a"), + Token::Ident("--a") ] ); } + #[test] + fn comments() { + let mut lexer = TokenIterator::new( + r#" +// this is a comment +// spanning two lines +/// this is a doc comment"#, + ); + assert_eq!( + lexer.next(), + Some(Token::Comment( + "// this is a comment\n// spanning two lines\n" + )) + ); + assert_eq!( + lexer.next(), + Some(Token::DocComment("/// this is a doc comment")) + ); + } + #[test] fn complex_iterator() { let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }"; let lexer = TokenIterator::new(&tokens); - let mut items = lexer - .into_token_items() - .map(|item| (item.token, item.lexeme)); - assert_eq!(items.next(), Some((Token::Fn, "fn"))); - assert_eq!(items.next(), Some((Token::Ident, "my-function"))); - assert_eq!(items.next(), Some((Token::OpenParens, "("))); - assert_eq!(items.next(), Some((Token::Ident, "x"))); - assert_eq!(items.next(), Some((Token::Colon, ":"))); - assert_eq!(items.next(), Some((Token::I32, "i32"))); - assert_eq!(items.next(), Some((Token::Comma, ","))); - assert_eq!(items.next(), Some((Token::Ident, "y"))); - assert_eq!(items.next(), Some((Token::Colon, ":"))); - assert_eq!(items.next(), Some((Token::F32, "f32"))); - assert_eq!(items.next(), Some((Token::CloseParens, ")"))); - assert_eq!(items.next(), Some((Token::MinusGreater, "->"))); - assert_eq!(items.next(), Some((Token::F32, "f32"))); - assert_eq!(items.next(), Some((Token::OpenBrace, "{"))); - assert_eq!(items.next(), Some((Token::Return, "return"))); - assert_eq!(items.next(), Some((Token::Ident, "x"))); - assert_eq!(items.next(), Some((Token::Plus, "+"))); - assert_eq!(items.next(), Some((Token::Ident, "y"))); - assert_eq!(items.next(), Some((Token::Semi, ";"))); - assert_eq!(items.next(), Some((Token::CloseBrace, "}"))); + let mut items = lexer.into_token_items().map(|item| item.token); + assert_eq!(items.next(), Some(Token::Fn)); + assert_eq!(items.next(), Some(Token::Ident("my-function"))); + assert_eq!(items.next(), Some(Token::OpenParens)); + assert_eq!(items.next(), Some(Token::Ident("x"))); + assert_eq!(items.next(), Some(Token::Colon)); + assert_eq!(items.next(), Some(Token::I32)); + assert_eq!(items.next(), Some(Token::Comma)); + assert_eq!(items.next(), Some(Token::Ident("y"))); + assert_eq!(items.next(), Some(Token::Colon)); + assert_eq!(items.next(), Some(Token::F32)); + assert_eq!(items.next(), Some(Token::CloseParens)); + assert_eq!(items.next(), Some(Token::MinusGreater)); + assert_eq!(items.next(), Some(Token::F32)); + assert_eq!(items.next(), Some(Token::OpenBrace)); + assert_eq!(items.next(), Some(Token::Return)); + assert_eq!(items.next(), Some(Token::Ident("x"))); + assert_eq!(items.next(), Some(Token::Plus)); + assert_eq!(items.next(), Some(Token::Ident("y"))); + assert_eq!(items.next(), Some(Token::Semi)); + assert_eq!(items.next(), Some(Token::CloseBrace)); assert_eq!(items.next(), None); } } diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index 885462d..395da7d 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -1,12 +1,15 @@ use internment::Intern; -use lexer::{ - Consuming, ReborrowingConsumingIterator, ReborrowingIterator, ReborrowingPeekingIterator, - Token, TokenConsumer, TokenItem, TokenItemIterator, -}; +use lexer::{Token, TokenConsumer, TokenItem, TokenItemIterator}; use logos::Logos; use pomelo::pomelo; use thiserror::Error; +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum IntSize { + Bits(u16), + Pointer, +} + #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum InnerType { Top, @@ -15,7 +18,7 @@ pub enum InnerType { Bool, Int { signed: bool, - bits: u8, + size: IntSize, }, Float { float_type: FloatType, @@ -98,7 +101,8 @@ pub enum AstNode { ty: Type, value: Value, }, - ExpressionStatement { + NoopExpr, + Stmt { expr: Index, }, ControlFlow { @@ -249,17 +253,33 @@ pub enum AstNode { Else { expr: Index, }, + Comment { + text: String, + }, + Attributes { + attrs: Vec, + }, + Doc { + text: String, + }, Error { err: Box, }, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub enum Visibility { + #[default] + Private, + Public, +} + #[derive(Debug, Error)] -pub enum ParseError { +pub enum ParseError<'a> { #[error("End of file.")] EOF, #[error("Unexpected token: {0:?}")] - UnexpectedToken(Token), + UnexpectedToken(Token<'a>), #[error("Not a type.")] NotAType, } @@ -283,6 +303,7 @@ impl Ast { #[derive(Debug)] struct FunctionDecl { name: String, + visibility: Visibility, return_type: Type, parameter_list: Option, body: Index, @@ -290,6 +311,7 @@ struct FunctionDecl { #[derive(Debug)] struct Parameter { + mutable: bool, name: String, param_type: Type, } @@ -310,41 +332,86 @@ pomelo! { use super::AstNode; use super::{ Parameter, Ast, ParameterList, FunctionDecl, Type, InnerType, - FloatType, ExtraToken, Index, + FloatType, ExtraToken, Index, IntSize, Visibility, }; }; %extra_argument Ast; %parser pub struct Parser<'a>{}; - %extra_token (&'a str, u32); + %token #[derive(Debug)] pub enum Token<'a> {}; + // %default_type &'a str; + %type Ident &'a str; + %type DocComment &'a str; + %type Comment &'a str; %type fn_decl FunctionDecl; %type parameter Parameter; %type parameter_list ParameterList; %type typ Type; %type return_type Type; %type block Index; + %type decl Index; + %type decl_list Vec; + %type file Index; - file ::= decl_list?; - decl_list ::= decl; - decl_list ::= decl_list decl; + file ::= decl_list?(list) { + let decls = list.unwrap_or_default(); + extra.push(AstNode::File { decls }) + }; + decl_list ::= decl(decl) { vec![decl] }; + decl_list ::= decl_list(dl) decl(decl) { + let mut list = dl; + list.push(decl); + list + }; typ ::= Bool { internment::Intern::new(InnerType::Bool) }; - typ ::= I8 { internment::Intern::new(InnerType::Int { signed: true, bits: 8 }) }; - typ ::= I16 { internment::Intern::new(InnerType::Int { signed: true, bits: 16 }) }; - typ ::= I32 { internment::Intern::new(InnerType::Int { signed: true, bits: 32 }) }; - typ ::= I64 { internment::Intern::new(InnerType::Int { signed: true, bits: 64 }) }; - typ ::= U8 { internment::Intern::new(InnerType::Int { signed: false, bits: 8 }) }; - typ ::= U16 { internment::Intern::new(InnerType::Int { signed: false, bits: 16 }) }; - typ ::= U32 { internment::Intern::new(InnerType::Int { signed: false, bits: 32 }) }; - typ ::= U64 { internment::Intern::new(InnerType::Int { signed: false, bits: 64 }) }; + typ ::= I1 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(1) }) }; + typ ::= I8 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(8) }) }; + typ ::= I16 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(16) }) }; + typ ::= I32 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(32) }) }; + typ ::= I64 { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Bits(64) }) }; + typ ::= U1 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(1) }) }; + typ ::= U8 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(8) }) }; + typ ::= U16 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(16) }) }; + typ ::= U32 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(32) }) }; + typ ::= U64 { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Bits(64) }) }; + typ ::= ISize { internment::Intern::new(InnerType::Int { signed: true, size: IntSize::Pointer }) }; + typ ::= USize { internment::Intern::new(InnerType::Int { signed: false, size: IntSize::Pointer }) }; typ ::= F32 { internment::Intern::new(InnerType::Float { float_type: FloatType::F32 }) }; typ ::= F64 { internment::Intern::new(InnerType::Float { float_type: FloatType::F64 }) }; typ ::= Bang { internment::Intern::new(InnerType::Bottom) }; - typ ::= LParen RParen { internment::Intern::new(InnerType::Unit) }; + typ ::= unit { internment::Intern::new(InnerType::Unit) }; + typ ::= Void { internment::Intern::new(InnerType::Unit) }; + + unit ::= LParen RParen; + + %type expr Index; + %type stmt Index; + %type stmts Vec; + expr ::= { extra.push(AstNode::NoopExpr)}; + stmt ::= expr(expr) Semi { extra.push(AstNode::Stmt { expr }) }; + + stmts ::= stmt(s) { vec![s] }; + stmts ::= stmts(ss) stmt(s) { + let mut v = ss; + v.push(s); + v + }; + block ::= LBrace stmts?(ss) RBrace { + extra.push(AstNode::Block { + statements: ss.unwrap_or_default(), + expr: None }) + }; + + %type vis Visibility; + vis ::= Pub { Visibility::Public }; + + %type mutable bool; + mutable ::= Mutable { true }; + mutable ::= { false }; return_type ::= Arrow typ(return_type) { return_type }; - block ::= LBrace RBrace { extra.push(AstNode::Block { statements: vec![], expr: None }) }; - parameter ::= Ident(name) Colon typ(param_type) { - Parameter { name: name.0.to_string(), param_type } + parameter ::= mutable(mutable) Ident(name) Colon typ(param_type) { + Parameter { mutable, name: name.to_string(), param_type } }; parameter_list ::= parameter(p) { let idx = extra.push(AstNode::Parameter { name: p.name, param_type: p.param_type }); @@ -360,11 +427,12 @@ pomelo! { pl }; - decl ::= fn_decl(f) { extra.nodes.push(AstNode::FunctionDecl(f)); }; - fn_decl ::= Fn Ident(name) LParen parameter_list?(parameters) RParen return_type(rtype) block(body) { - let name = name.0.to_string(); + decl ::= fn_decl(f) { extra.push(AstNode::FunctionDecl(f)) }; + fn_decl ::= vis?(visibility) Fn Ident(name) LParen parameter_list?(parameters) RParen return_type(rtype) block(body) { + let name = name.to_string(); FunctionDecl { name, + visibility: visibility.unwrap_or_default(), return_type: rtype, parameter_list: parameters, body, @@ -372,6 +440,109 @@ pomelo! { }; } +impl<'a> From> for parser::Token<'a> { + fn from(value: lexer::Token<'a>) -> Self { + use lexer::Token; + match value { + Token::Fn => Self::Fn, + Token::OpenParens => Self::LParen, + Token::CloseParens => Self::RParen, + Token::OpenBrace => Self::LBrace, + Token::CloseBrace => Self::RBrace, + Token::Ident(ident) => Self::Ident(ident), + Token::Comment(text) => Self::Comment(text), + Token::DocComment(text) => Self::DocComment(text), + Token::OpenSquareBracket => todo!(), // Self::LBracket, + Token::CloseSquareBracket => todo!(), // Self::RBracket, + Token::Comma => Self::Comma, + Token::Colon => Self::Colon, + Token::Semi => Self::Semi, + Token::Elipsis3 => todo!(), + Token::Elipsis2 => todo!(), + Token::Equal => todo!(), + Token::Void => Self::Void, + Token::Bool => Self::Bool, + Token::F32 => Self::F32, + Token::F64 => Self::F64, + Token::ISize => Self::ISize, + Token::USize => Self::USize, + Token::U1 => Self::U1, + Token::U8 => Self::U8, + Token::U16 => Self::U16, + Token::U32 => Self::U32, + Token::U64 => Self::U64, + Token::I1 => Self::I1, + Token::I8 => Self::I8, + Token::I16 => Self::I16, + Token::I32 => Self::I32, + Token::I64 => Self::I64, + Token::Const => todo!(), // Self::Const, + Token::Mutable => Self::Mutable, + Token::Volatile => todo!(), + Token::Noalias => todo!(), + Token::Let => todo!(), + Token::Var => todo!(), + Token::If => todo!(), + Token::As => todo!(), + Token::Else => todo!(), + Token::Return => todo!(), + Token::Struct => todo!(), + Token::Type => todo!(), + Token::Union => todo!(), + Token::Enum => todo!(), + Token::Packed => todo!(), + Token::Extern => todo!(), + Token::Pub => Self::Pub, + Token::Module => todo!(), + Token::Dot => todo!(), + Token::MinusGreater => Self::Arrow, + Token::Bang => Self::Bang, + Token::Tilde => todo!(), + Token::Plus => todo!(), + Token::Minus => todo!(), + Token::Star => todo!(), + Token::Slash => todo!(), + Token::Percent => todo!(), + Token::Less => todo!(), + Token::Greater => todo!(), + Token::LessEqual => todo!(), + Token::GreaterEqual => todo!(), + Token::EqualEqual => todo!(), + Token::BangEqual => todo!(), + Token::PipePipe => todo!(), + Token::AmpersandAmpersand => todo!(), + Token::Ampersand => todo!(), + Token::Caret => todo!(), + Token::Pipe => todo!(), + Token::LessLess => todo!(), + Token::GreaterGreater => todo!(), + Token::Question => todo!(), + Token::PlusEqual => todo!(), + Token::MinusEqual => todo!(), + Token::StarEqual => todo!(), + Token::SlashEqual => todo!(), + Token::PercentEqual => todo!(), + Token::AmpersandEqual => todo!(), + Token::PipeEqual => todo!(), + Token::CaretEqual => todo!(), + Token::LessLessEqual => todo!(), + Token::GreaterGreaterEqual => todo!(), + Token::Eof(_) => todo!(), + Token::ParseError(_) => todo!(), + Token::CharConstant(_) => todo!(), + Token::IntegerConstant(_) => todo!(), + Token::IntegerHexConstant(_) => todo!(), + Token::IntegerBinConstant(_) => todo!(), + Token::IntegerOctConstant(_) => todo!(), + Token::FloatingConstant(_) => todo!(), + Token::FloatingExpConstant(_) => todo!(), + Token::DotFloatingConstant(_) => todo!(), + Token::DotFloatingExpConstant(_) => todo!(), + Token::StringConstant(_) => todo!(), + } + } +} + #[cfg(test)] mod tests { use crate::AstNode; @@ -380,4 +551,19 @@ mod tests { fn print_ast_node_size() { eprintln!("Size of AstNode: {}", std::mem::size_of::()); } + + #[test] + fn parse() { + use crate::parser::{Parser, Token}; + let input = "fn main(a: u32, b: u32) -> u32 {}"; + let mut lex = lexer::TokenIterator::new(input); + let mut mapped = lex.inspect(|t| eprintln!("{t:?}")).map(Token::from); + let mut ast = crate::Ast::new(); + let mut parser = Parser::new(ast); + while let Some(token) = mapped.next() { + parser.parse(token).unwrap(); + } + let (out, ast) = parser.end_of_input().unwrap(); + eprintln!("AST: {:#?}", ast); + } }