diff --git a/Cargo.toml b/Cargo.toml index 71dff5b..48c08be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,9 @@ +[workspace] +resolver = "3" +members = [ + "crates/lexer" +] + [package] name = "compiler" version = "0.1.0" @@ -16,5 +22,13 @@ paste = "1.0.15" petgraph = "0.6.5" thiserror = "1.0.63" unicode-xid = "0.2.4" +tracing = "0.1.41" + +werkzeug = { path = "../../rust/werkzeug" } + + +[workspace.dependencies] +unicode-xid = "0.2.4" +tracing = "0.1.41" werkzeug = { path = "../../rust/werkzeug" } \ No newline at end of file diff --git a/crates/lexer/Cargo.toml b/crates/lexer/Cargo.toml new file mode 100644 index 0000000..65b3975 --- /dev/null +++ b/crates/lexer/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "lexer" +version = "0.1.0" +edition = "2024" + +[dependencies] +tracing = { workspace = true } +werkzeug = { workspace = true } +unicode-xid = { workspace = true } \ No newline at end of file diff --git a/crates/lexer/src/lib.rs b/crates/lexer/src/lib.rs new file mode 100644 index 0000000..8b53c6c --- /dev/null +++ b/crates/lexer/src/lib.rs @@ -0,0 +1,444 @@ +#![feature(slice_swap_unchecked, iter_collect_into)] + +mod is_things { + /// True if `c` is considered a whitespace according to Rust language definition. + /// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html) + /// for definitions of these classes. + pub fn is_whitespace(c: char) -> bool { + // This is Pattern_White_Space. + // + // Note that this set is stable (ie, it doesn't change with different + // Unicode versions), so it's ok to just hard-code the values. + + matches!( + c, + // Usual ASCII suspects + '\u{0009}' // \t + | '\u{000A}' // \n + | '\u{000B}' // vertical tab + | '\u{000C}' // form feed + | '\u{000D}' // \r + | '\u{0020}' // space + + // NEXT LINE from latin1 + | '\u{0085}' + + // Bidi markers + | '\u{200E}' // LEFT-TO-RIGHT MARK + | '\u{200F}' // RIGHT-TO-LEFT MARK + + // Dedicated whitespace characters from Unicode + | '\u{2028}' // LINE SEPARATOR + | '\u{2029}' // PARAGRAPH SEPARATOR + ) + } + + /// True if `c` is valid as a first character of an identifier. + /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for + /// a formal definition of valid identifier name. + pub fn is_id_start(c: char) -> bool { + // This is XID_Start OR '_' (which formally is not a XID_Start). + c == '_' || unicode_xid::UnicodeXID::is_xid_start(c) + } + + /// True if `c` is valid as a non-first character of an identifier. + /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for + /// a formal definition of valid identifier name. + pub fn is_id_continue(c: char) -> bool { + unicode_xid::UnicodeXID::is_xid_continue(c) + } + + /// The passed string is lexically an identifier. + pub fn is_ident(string: &str) -> bool { + let mut chars = string.chars(); + if let Some(start) = chars.next() { + is_id_start(start) && chars.all(is_id_continue) + } else { + false + } + } + + pub fn is_digit(ch: char) -> bool { + ('0'..='9').contains(&ch) + } + + pub fn is_bin_digit(ch: char) -> bool { + ch == '0' || ch == '1' + } + + pub fn is_nonzero_digit(ch: char) -> bool { + ('1'..='9').contains(&ch) + } + + pub fn is_oct_digit(ch: char) -> bool { + ('0'..='7').contains(&ch) + } + + pub fn is_hex_digit(ch: char) -> bool { + ('0'..='9').contains(&ch) || ('a'..='f').contains(&ch) || ('A'..='F').contains(&ch) + } +} + +macro_rules! tokens { + ($vis:vis $ty_name:ident: + { + $($name2:ident),* + }, + { + $($name:ident => $lexeme:literal),* + }) => { + + #[allow(dead_code)] + #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] + $vis enum $ty_name { + $($name, + )* + $($name2,)* + } + + impl ::core::fmt::Display for $ty_name { + fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { + match self { + $(Self::$name => write!(f, "{}", $lexeme),)* + $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),* + } + } + } + + #[allow(dead_code)] + impl $ty_name { + $vis fn lexeme(&self) -> Option<&'static str> { + match self { + $(Self::$name => Some($lexeme),)* + $(Self::$name2 => None),* + } + } + + /// returns the number of chars in this lexeme + $vis fn lexeme_len(&self) -> usize { + self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0) + } + + $vis fn maybe_ident(&self) -> bool { + self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false) + } + + $vis fn lexemes() -> &'static [(Self, &'static str)] { + &[ + $((Self::$name, $lexeme)),* + ] + } + } + }; +} + +tokens!(pub Token: { + Eof, + ParseError, + // Marker Token for any Comment + Comment, + DocComment, + // Marker Token for any pre-processing directive + CharConstant, + IntegerConstant, + IntegerHexConstant, + IntegerBinConstant, + IntegerOctConstant, + FloatingConstant, + FloatingExpConstant, + DotFloatingConstant, + DotFloatingExpConstant, + StringConstant, + IntegralType, + Ident +}, + // Lexical Tokens: + { + SlashSlash => "//", + SlashStar => "/*", + // SlashStarStar => "/**", + StarSlash => "*/", + // SlashSlashSlash => "///", + // Punctuation: + OpenParens => "(", + CloseParens => ")", + OpenBrace => "{", + CloseBrace => "}", + OpenSquareBracket => "[", + CloseSquareBracket => "]", + Semi => ";", + Comma => ",", + Elipsis3 => "...", + Elipsis2 => "..", + Colon => ":", + Equal => "=", + // Keywords: + Void => "void", + Bool => "bool", + F32 => "f32", + F64 => "f64", + ISize => "isize", + USize => "usize", + Const => "const", + Volatile => "volatile", + Noalias => "noalias", + Fn => "fn", + Let => "let", + Var => "var", + If => "if", + As => "as", + Else => "else", + Return => "return", + Struct => "struct", + Type => "type", + Union => "union", + Enum => "enum", + Packed => "packed", + Extern => "extern", + Pub => "pub", + // Operators + Dot => ".", + MinusGreater => "->", + Bang => "!", + Tilde => "~", + Plus => "+", + Minus => "-", + Star => "*", + Slash => "/", + Percent => "%", + Less => "<", + Greater => ">", + LessEqual => "<=", + GreaterEqual => ">=", + EqualEqual => "==", + BangEqual => "!=", + PipePipe => "||", + AmpersandAmpersand => "&&", + Ampersand => "&", + Caret => "^", + Pipe => "|", + LessLess => "<<", + GreaterGreater => ">>", + Question => "?", + PlusEqual => "+=", + MinusEqual => "-=", + StarEqual => "*=", + SlashEqual => "/=", + PercentEqual => "%=", + AmpersandEqual => "&=", + PipeEqual => "|=", + CaretEqual => "^=", + LessLessEqual => "<<=", + GreaterGreaterEqual => ">>=" + }); + +impl Token { + pub fn is_assignment_op(self) -> bool { + match self { + Token::PlusEqual + | Token::MinusEqual + | Token::StarEqual + | Token::SlashEqual + | Token::PercentEqual + | Token::PipeEqual + | Token::CaretEqual + | Token::AmpersandEqual + | Token::LessLessEqual + | Token::GreaterGreaterEqual + | Token::Equal => true, + _ => false, + } + } + pub fn is_unary_op(self) -> bool { + match self { + Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang => true, + _ => false, + } + } + pub fn is_binary_op(self) -> bool { + match self { + Token::Star + | Token::Slash + | Token::Percent + | Token::Pipe + | Token::Ampersand + | Token::Caret + | Token::Plus + | Token::Minus + | Token::PipePipe + | Token::AmpersandAmpersand + | Token::BangEqual + | Token::EqualEqual + | Token::Less + | Token::Greater + | Token::LessEqual + | Token::GreaterEqual + | Token::LessLess + | Token::GreaterGreater => true, + _ => false, + } + } +} + +/// A list of lexemes used by the `LexemeParser`. +/// `lexemes` contains every token that has a defined lexeme, such as `fn`, `f32`, `const`, etc. +/// The `LexemeList` keeps track of two offsets into the `lexemes` array, +/// splitting it into three windows: +/// - [0, start_candidates) - tokens that are still being considered for parsing +/// - [start_candidates, end_candidates) - the tokens which this lexeme matches +/// - [end_candidates, len) - tokens that have been filtered out and are no longer considered +/// On each iteration of the parsing loop, the remaining tokens are matched +/// against the next character and, if they match completely, are swapped into +/// the candidates window, or swapped to the end if they don't. +struct LexemeList { + lexemes: Box<[Token]>, + start_candidates: usize, + end_candidates: usize, + filtered: Vec<(usize, FilterResult)>, +} + +enum FilterResult { + Remove, + Candidate, +} + +impl LexemeList { + fn new() -> Self { + let lexemes = Token::lexemes() + .iter() + .map(|(tok, _)| tok.clone()) + .collect::>(); + + Self { + start_candidates: lexemes.len(), + end_candidates: lexemes.len(), + lexemes, + filtered: Vec::new(), + } + } + + fn clear(&mut self) { + self.start_candidates = self.lexemes.len(); + self.end_candidates = self.lexemes.len(); + } + + fn remaining(&self) -> &[Token] { + &self.lexemes[0..self.start_candidates] + } + + fn candidates(&self) -> &[Token] { + &self.lexemes[self.start_candidates..self.end_candidates] + } + + fn step(&mut self, ch: char, pos: usize) { + // smartly reuse allocation for `filtered` + // truly one of the premature optimizations. + // but it just feels good, innit? + let mut filtered = core::mem::take(&mut self.filtered); + + self.remaining() + .iter() + .enumerate() + .filter_map(|(i, tok)| { + let bytes = tok.lexeme().unwrap().as_bytes(); + // SAFETY: all tokens in `self.remaining()` are lexical tokens, and + // they are all valid ascii + let c = unsafe { + // TODO: maybe keep a list of `Char<'_>`s around in order to + // support fully utf8 tokens? + char::from_u32_unchecked(bytes[pos] as u32) + }; + match c == ch { + false => Some((i, FilterResult::Remove)), + true if bytes.len() <= pos + 1 => Some((i, FilterResult::Candidate)), + true => None, + } + }) + .collect_into(&mut filtered); + + // iterate in reverse order so that we can safely swap elements + // drain here so that we can possibly reuse the `filtered` Vec allcoation + filtered.drain(..).rev().for_each(|(i, f)| { + match f { + // for candidates, swap the candidate with the last remaining + // token, then dec `start_candidates` + FilterResult::Candidate => { + // SAFETY: we know that `i` and `self.start_candidates - 1` + // are both valid indices: `self.start_candidates` starts at + // the end and each time it is decremented, one more element + // is removed from the front, so that as long as an element + // is remaining, `self.start_candidates` is always greater + // than 0. + // the order of the remaining elements is not meaningfully + // impacted because we only ever swap with elements after + // `i`, and `i` is the greatest index we will touch. + unsafe { + self.lexemes.swap_unchecked(i, self.start_candidates - 1); + self.start_candidates = self.start_candidates.saturating_sub(1); + } + } + // for removes, swap the last candidate with the last remainign + // token, then swap the remove with the last candidate, then dec + // `end_candidates` and `start_candidates` + FilterResult::Remove => { + unsafe { + // in the case that `start_candidates` == + // `end_candidates`, no swap happens and that's fine. + // remove this: v + // [a,b,c][d,e,f][g,h,i] + // swap these: ^ ^ + // [a,b,f][d,e,c][g,h,i] + // swap these: ^ ^ + // [a,c,f][d,e,b][g,h,i] + // decrement both counters: + // [a,c][f,d,e][b,g,h,i] + self.lexemes + .swap_unchecked(self.start_candidates - 1, self.end_candidates - 1); + self.lexemes.swap_unchecked(i, self.end_candidates - 1); + self.start_candidates = self.start_candidates.saturating_sub(1); + self.end_candidates = self.end_candidates.saturating_sub(1); + } + } + } + }); + + // replace `filtered` + self.filtered = filtered; + } +} + +/// Helper type for parsing tokens that have a defined lexeme, such as `fn`, +/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive +/// integral types, constants or identifiers are not parsed by this. +pub struct LexemeParser { + lexemes: LexemeList, + len: usize, +} + +impl LexemeParser { + pub fn new() -> Self { + Self { + lexemes: LexemeList::new(), + len: 0, + } + } + + pub fn parse(&mut self, mut tokens: impl Iterator) -> Option { + self.lexemes.clear(); + loop { + let Some(ch) = tokens.next() else { + break; + }; + + if crate::is_things::is_whitespace(ch) { + break; + } + + self.lexemes.step(ch, self.len); + if self.lexemes.remaining().is_empty() { + break; + } + } + self.lexemes.candidates().last().copied() + } +}