lexer crate
This commit is contained in:
parent
0fa3200b85
commit
bc0acf7e19
14
Cargo.toml
14
Cargo.toml
|
@ -1,3 +1,9 @@
|
|||
[workspace]
|
||||
resolver = "3"
|
||||
members = [
|
||||
"crates/lexer"
|
||||
]
|
||||
|
||||
[package]
|
||||
name = "compiler"
|
||||
version = "0.1.0"
|
||||
|
@ -16,5 +22,13 @@ paste = "1.0.15"
|
|||
petgraph = "0.6.5"
|
||||
thiserror = "1.0.63"
|
||||
unicode-xid = "0.2.4"
|
||||
tracing = "0.1.41"
|
||||
|
||||
werkzeug = { path = "../../rust/werkzeug" }
|
||||
|
||||
|
||||
[workspace.dependencies]
|
||||
unicode-xid = "0.2.4"
|
||||
tracing = "0.1.41"
|
||||
|
||||
werkzeug = { path = "../../rust/werkzeug" }
|
9
crates/lexer/Cargo.toml
Normal file
9
crates/lexer/Cargo.toml
Normal file
|
@ -0,0 +1,9 @@
|
|||
[package]
|
||||
name = "lexer"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
tracing = { workspace = true }
|
||||
werkzeug = { workspace = true }
|
||||
unicode-xid = { workspace = true }
|
444
crates/lexer/src/lib.rs
Normal file
444
crates/lexer/src/lib.rs
Normal file
|
@ -0,0 +1,444 @@
|
|||
#![feature(slice_swap_unchecked, iter_collect_into)]
|
||||
|
||||
mod is_things {
|
||||
/// Reports whether `c` is whitespace according to the Rust language definition.
///
/// The accepted characters are exactly the Unicode `Pattern_White_Space` set;
/// see the [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for the definition of this class.
pub fn is_whitespace(c: char) -> bool {
    // `Pattern_White_Space` is a stable set (it does not change between
    // Unicode versions), so the members are simply hard-coded here.
    matches!(
        c,
        // ASCII controls: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}'
        // ASCII space
        | ' '
        // NEXT LINE (from Latin-1)
        | '\u{0085}'
        // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        | '\u{200E}'..='\u{200F}'
        // Dedicated Unicode separators: LINE SEPARATOR, PARAGRAPH SEPARATOR
        | '\u{2028}'..='\u{2029}'
    )
}
|
||||
|
||||
/// True if `c` is valid as a first character of an identifier.
|
||||
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
|
||||
/// a formal definition of valid identifier name.
|
||||
pub fn is_id_start(c: char) -> bool {
|
||||
// This is XID_Start OR '_' (which formally is not a XID_Start).
|
||||
c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
|
||||
}
|
||||
|
||||
/// True if `c` is valid as a non-first character of an identifier.
|
||||
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
|
||||
/// a formal definition of valid identifier name.
|
||||
pub fn is_id_continue(c: char) -> bool {
|
||||
unicode_xid::UnicodeXID::is_xid_continue(c)
|
||||
}
|
||||
|
||||
/// The passed string is lexically an identifier.
|
||||
pub fn is_ident(string: &str) -> bool {
|
||||
let mut chars = string.chars();
|
||||
if let Some(start) = chars.next() {
|
||||
is_id_start(start) && chars.all(is_id_continue)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether `ch` is an ASCII decimal digit (`0` through `9`).
pub fn is_digit(ch: char) -> bool {
    ch.is_ascii_digit()
}
|
||||
|
||||
/// Reports whether `ch` is a binary digit (`0` or `1`).
pub fn is_bin_digit(ch: char) -> bool {
    matches!(ch, '0' | '1')
}
|
||||
|
||||
/// Reports whether `ch` is an ASCII decimal digit other than `0` (`1` through `9`).
pub fn is_nonzero_digit(ch: char) -> bool {
    matches!(ch, '1'..='9')
}
|
||||
|
||||
/// Reports whether `ch` is an octal digit (`0` through `7`).
pub fn is_oct_digit(ch: char) -> bool {
    matches!(ch, '0'..='7')
}
|
||||
|
||||
/// Reports whether `ch` is an ASCII hexadecimal digit
/// (`0`-`9`, `a`-`f` or `A`-`F`).
pub fn is_hex_digit(ch: char) -> bool {
    ch.is_ascii_hexdigit()
}
|
||||
}
|
||||
|
||||
/// Declares a token enum together with its lexeme helpers.
///
/// The invocation has the shape
/// `tokens!(<vis> <Name>: { Marker, ... }, { Variant => "lexeme", ... })`:
/// the first list names variants without a fixed spelling, the second list
/// pairs variants with their fixed spelling. Neither list accepts a trailing
/// comma.
///
/// The generated enum declares the fixed-spelling variants first, followed by
/// the marker variants, and gets:
/// - a `Display` impl printing the lexeme, or `<VariantName>` for markers,
/// - `lexeme()`, `lexeme_len()`, `maybe_ident()` and `lexemes()`.
///
/// `maybe_ident` expands to a call to `crate::is_things::is_ident`, so that
/// path must exist in the invoking crate.
macro_rules! tokens {
    (
        $vis:vis $ty_name:ident:
        { $($marker:ident),* },
        { $($fixed:ident => $text:literal),* }
    ) => {
        #[allow(dead_code)]
        #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
        $vis enum $ty_name {
            $($fixed,)*
            $($marker,)*
        }

        impl ::core::fmt::Display for $ty_name {
            fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
                match self {
                    $(Self::$fixed => f.write_str($text),)*
                    $(Self::$marker => f.write_str(concat!("<", stringify!($marker), ">")),)*
                }
            }
        }

        #[allow(dead_code)]
        impl $ty_name {
            /// Returns the fixed spelling of this token, or `None` for marker tokens.
            $vis fn lexeme(&self) -> Option<&'static str> {
                match self {
                    $(Self::$fixed => Some($text),)*
                    $(Self::$marker => None,)*
                }
            }

            /// Returns the number of chars in this token's lexeme (0 for marker tokens).
            $vis fn lexeme_len(&self) -> usize {
                self.lexeme().map_or(0, |text| text.chars().count())
            }

            /// Returns `true` if this token has a lexeme that is also lexically an identifier.
            $vis fn maybe_ident(&self) -> bool {
                self.lexeme().map_or(false, crate::is_things::is_ident)
            }

            /// Returns every fixed-spelling token paired with its lexeme, in declaration order.
            $vis fn lexemes() -> &'static [(Self, &'static str)] {
                &[$((Self::$fixed, $text)),*]
            }
        }
    };
}
|
||||
|
||||
// Declares `Token`, the set of token kinds.
//
// The first list holds marker tokens that have no fixed spelling; the second
// list pairs every fixed-spelling token with its lexeme. The order of the
// entries determines the order of the enum variants and of
// `Token::lexemes()`, so new entries should be added deliberately.
tokens!(pub Token: {
    Eof, ParseError,
    // Marker tokens for any comment / doc comment.
    Comment, DocComment,
    // Constants.
    CharConstant,
    IntegerConstant, IntegerHexConstant, IntegerBinConstant, IntegerOctConstant,
    FloatingConstant, FloatingExpConstant,
    DotFloatingConstant, DotFloatingExpConstant,
    StringConstant,
    IntegralType,
    Ident
},
// Lexical tokens:
{
    // Comment delimiters (`/**` and `///` currently have no entry of their own).
    SlashSlash => "//", SlashStar => "/*", StarSlash => "*/",

    // Punctuation.
    OpenParens => "(", CloseParens => ")",
    OpenBrace => "{", CloseBrace => "}",
    OpenSquareBracket => "[", CloseSquareBracket => "]",
    Semi => ";", Comma => ",",
    Elipsis3 => "...", Elipsis2 => "..",
    Colon => ":", Equal => "=",

    // Keywords.
    Void => "void", Bool => "bool",
    F32 => "f32", F64 => "f64",
    ISize => "isize", USize => "usize",
    Const => "const", Volatile => "volatile", Noalias => "noalias",
    Fn => "fn", Let => "let", Var => "var",
    If => "if", As => "as", Else => "else", Return => "return",
    Struct => "struct", Type => "type", Union => "union", Enum => "enum",
    Packed => "packed", Extern => "extern", Pub => "pub",

    // Operators.
    Dot => ".", MinusGreater => "->",
    Bang => "!", Tilde => "~",
    Plus => "+", Minus => "-", Star => "*", Slash => "/", Percent => "%",
    Less => "<", Greater => ">", LessEqual => "<=", GreaterEqual => ">=",
    EqualEqual => "==", BangEqual => "!=",
    PipePipe => "||", AmpersandAmpersand => "&&",
    Ampersand => "&", Caret => "^", Pipe => "|",
    LessLess => "<<", GreaterGreater => ">>",
    Question => "?",

    // Compound assignment operators.
    PlusEqual => "+=", MinusEqual => "-=", StarEqual => "*=", SlashEqual => "/=",
    PercentEqual => "%=", AmpersandEqual => "&=", PipeEqual => "|=", CaretEqual => "^=",
    LessLessEqual => "<<=", GreaterGreaterEqual => ">>="
});
|
||||
|
||||
impl Token {
|
||||
pub fn is_assignment_op(self) -> bool {
|
||||
match self {
|
||||
Token::PlusEqual
|
||||
| Token::MinusEqual
|
||||
| Token::StarEqual
|
||||
| Token::SlashEqual
|
||||
| Token::PercentEqual
|
||||
| Token::PipeEqual
|
||||
| Token::CaretEqual
|
||||
| Token::AmpersandEqual
|
||||
| Token::LessLessEqual
|
||||
| Token::GreaterGreaterEqual
|
||||
| Token::Equal => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
pub fn is_unary_op(self) -> bool {
|
||||
match self {
|
||||
Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
pub fn is_binary_op(self) -> bool {
|
||||
match self {
|
||||
Token::Star
|
||||
| Token::Slash
|
||||
| Token::Percent
|
||||
| Token::Pipe
|
||||
| Token::Ampersand
|
||||
| Token::Caret
|
||||
| Token::Plus
|
||||
| Token::Minus
|
||||
| Token::PipePipe
|
||||
| Token::AmpersandAmpersand
|
||||
| Token::BangEqual
|
||||
| Token::EqualEqual
|
||||
| Token::Less
|
||||
| Token::Greater
|
||||
| Token::LessEqual
|
||||
| Token::GreaterEqual
|
||||
| Token::LessLess
|
||||
| Token::GreaterGreater => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A list of lexemes used by the `LexemeParser`.
|
||||
/// `lexemes` contains every token that has a defined lexeme, such as `fn`, `f32`, `const`, etc.
|
||||
/// The `LexemeList` keeps track of two offsets into the `lexemes` array,
|
||||
/// splitting it into three windows:
|
||||
/// - [0, start_candidates) - tokens that are still being considered for parsing
|
||||
/// - [start_candidates, end_candidates) - the tokens which this lexeme matches
|
||||
/// - [end_candidates, len) - tokens that have been filtered out and are no longer considered
|
||||
/// On each iteration of the parsing loop, the remaining tokens are matched
|
||||
/// against the next character and, if they match completely, are swapped into
|
||||
/// the candidates window, or swapped to the end if they don't.
|
||||
struct LexemeList {
|
||||
lexemes: Box<[Token]>,
|
||||
start_candidates: usize,
|
||||
end_candidates: usize,
|
||||
filtered: Vec<(usize, FilterResult)>,
|
||||
}
|
||||
|
||||
enum FilterResult {
|
||||
Remove,
|
||||
Candidate,
|
||||
}
|
||||
|
||||
impl LexemeList {
|
||||
fn new() -> Self {
|
||||
let lexemes = Token::lexemes()
|
||||
.iter()
|
||||
.map(|(tok, _)| tok.clone())
|
||||
.collect::<Box<_>>();
|
||||
|
||||
Self {
|
||||
start_candidates: lexemes.len(),
|
||||
end_candidates: lexemes.len(),
|
||||
lexemes,
|
||||
filtered: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.start_candidates = self.lexemes.len();
|
||||
self.end_candidates = self.lexemes.len();
|
||||
}
|
||||
|
||||
fn remaining(&self) -> &[Token] {
|
||||
&self.lexemes[0..self.start_candidates]
|
||||
}
|
||||
|
||||
fn candidates(&self) -> &[Token] {
|
||||
&self.lexemes[self.start_candidates..self.end_candidates]
|
||||
}
|
||||
|
||||
fn step(&mut self, ch: char, pos: usize) {
|
||||
// smartly reuse allocation for `filtered`
|
||||
// truly one of the premature optimizations.
|
||||
// but it just feels good, innit?
|
||||
let mut filtered = core::mem::take(&mut self.filtered);
|
||||
|
||||
self.remaining()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(i, tok)| {
|
||||
let bytes = tok.lexeme().unwrap().as_bytes();
|
||||
// SAFETY: all tokens in `self.remaining()` are lexical tokens, and
|
||||
// they are all valid ascii
|
||||
let c = unsafe {
|
||||
// TODO: maybe keep a list of `Char<'_>`s around in order to
|
||||
// support fully utf8 tokens?
|
||||
char::from_u32_unchecked(bytes[pos] as u32)
|
||||
};
|
||||
match c == ch {
|
||||
false => Some((i, FilterResult::Remove)),
|
||||
true if bytes.len() <= pos + 1 => Some((i, FilterResult::Candidate)),
|
||||
true => None,
|
||||
}
|
||||
})
|
||||
.collect_into(&mut filtered);
|
||||
|
||||
// iterate in reverse order so that we can safely swap elements
|
||||
// drain here so that we can possibly reuse the `filtered` Vec allcoation
|
||||
filtered.drain(..).rev().for_each(|(i, f)| {
|
||||
match f {
|
||||
// for candidates, swap the candidate with the last remaining
|
||||
// token, then dec `start_candidates`
|
||||
FilterResult::Candidate => {
|
||||
// SAFETY: we know that `i` and `self.start_candidates - 1`
|
||||
// are both valid indices: `self.start_candidates` starts at
|
||||
// the end and each time it is decremented, one more element
|
||||
// is removed from the front, so that as long as an element
|
||||
// is remaining, `self.start_candidates` is always greater
|
||||
// than 0.
|
||||
// the order of the remaining elements is not meaningfully
|
||||
// impacted because we only ever swap with elements after
|
||||
// `i`, and `i` is the greatest index we will touch.
|
||||
unsafe {
|
||||
self.lexemes.swap_unchecked(i, self.start_candidates - 1);
|
||||
self.start_candidates = self.start_candidates.saturating_sub(1);
|
||||
}
|
||||
}
|
||||
// for removes, swap the last candidate with the last remainign
|
||||
// token, then swap the remove with the last candidate, then dec
|
||||
// `end_candidates` and `start_candidates`
|
||||
FilterResult::Remove => {
|
||||
unsafe {
|
||||
// in the case that `start_candidates` ==
|
||||
// `end_candidates`, no swap happens and that's fine.
|
||||
// remove this: v
|
||||
// [a,b,c][d,e,f][g,h,i]
|
||||
// swap these: ^ ^
|
||||
// [a,b,f][d,e,c][g,h,i]
|
||||
// swap these: ^ ^
|
||||
// [a,c,f][d,e,b][g,h,i]
|
||||
// decrement both counters:
|
||||
// [a,c][f,d,e][b,g,h,i]
|
||||
self.lexemes
|
||||
.swap_unchecked(self.start_candidates - 1, self.end_candidates - 1);
|
||||
self.lexemes.swap_unchecked(i, self.end_candidates - 1);
|
||||
self.start_candidates = self.start_candidates.saturating_sub(1);
|
||||
self.end_candidates = self.end_candidates.saturating_sub(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// replace `filtered`
|
||||
self.filtered = filtered;
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
|
||||
/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
|
||||
/// integral types, constants or identifiers are not parsed by this.
|
||||
pub struct LexemeParser {
|
||||
lexemes: LexemeList,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl LexemeParser {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
lexemes: LexemeList::new(),
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse(&mut self, mut tokens: impl Iterator<Item = char>) -> Option<Token> {
|
||||
self.lexemes.clear();
|
||||
loop {
|
||||
let Some(ch) = tokens.next() else {
|
||||
break;
|
||||
};
|
||||
|
||||
if crate::is_things::is_whitespace(ch) {
|
||||
break;
|
||||
}
|
||||
|
||||
self.lexemes.step(ch, self.len);
|
||||
if self.lexemes.remaining().is_empty() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
self.lexemes.candidates().last().copied()
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue