SeaLang/crates/lexer/src/lib.rs

564 lines
17 KiB
Rust

#![feature(slice_swap_unchecked, iter_collect_into)]
mod is_things {
/// Returns `true` when `c` belongs to the whitespace class used by the Rust language
/// definition (`Pattern_White_Space`).
/// See the [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for the definition of this class.
pub fn is_whitespace(c: char) -> bool {
    // Pattern_White_Space is a stable set (it does not change between Unicode
    // versions), which is why the code points can simply be hard-coded here.
    matches!(
        c,
        // ASCII controls: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}'
            // space
            | '\u{0020}'
            // NEXT LINE (latin1)
            | '\u{0085}'
            // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
            | '\u{200E}'..='\u{200F}'
            // Dedicated Unicode separators: LINE SEPARATOR, PARAGRAPH SEPARATOR
            | '\u{2028}'..='\u{2029}'
    )
}
/// Returns `true` when `c` may begin an identifier.
///
/// Accepted are `_`, `-` and every character with the Unicode `XID_Start` property.
/// Note that `-` is accepted in addition to the set described by the
/// [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html).
pub fn is_id_start(c: char) -> bool {
    // Neither '_' nor '-' is formally XID_Start, so both are special-cased first.
    matches!(c, '_' | '-') || unicode_xid::UnicodeXID::is_xid_start(c)
}
/// Returns `true` when `c` may appear after the first character of an identifier.
///
/// Accepted are `-` and every character with the Unicode `XID_Continue` property
/// (see the [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html)
/// for the `XID_Continue` part).
pub fn is_id_continue(c: char) -> bool {
    // Both checks are pure, so testing the cheap literal comparison first is equivalent.
    c == '-' || unicode_xid::UnicodeXID::is_xid_continue(c)
}
/// Returns `true` when `string` is lexically an identifier: it is non-empty, its first
/// character satisfies `is_id_start` and every following one satisfies `is_id_continue`.
pub fn is_ident(string: &str) -> bool {
    let mut rest = string.chars();
    // An empty string has no first character and therefore is not an identifier.
    rest.next()
        .is_some_and(|first| is_id_start(first) && rest.all(is_id_continue))
}
/// Returns `true` for the ASCII decimal digits `0` through `9`.
pub fn is_digit(ch: char) -> bool {
    ch.is_ascii_digit()
}
/// Returns `true` for the binary digits `0` and `1`.
pub fn is_bin_digit(ch: char) -> bool {
    matches!(ch, '0' | '1')
}
/// Returns `true` for the ASCII decimal digits `1` through `9` (everything but `0`).
#[expect(dead_code)]
pub fn is_nonzero_digit(ch: char) -> bool {
    matches!(ch, '1'..='9')
}
/// Returns `true` for the octal digits `0` through `7`.
pub fn is_oct_digit(ch: char) -> bool {
    matches!(ch, '0'..='7')
}
/// Returns `true` for the ASCII hexadecimal digits `0-9`, `a-f` and `A-F`.
pub fn is_hex_digit(ch: char) -> bool {
    ch.is_ascii_hexdigit()
}
}
/// Declares a token enum together with its `Display` impl and lexeme helpers.
///
/// Invocation shape: `tokens!(vis Name: { Marker, ... }, { Fixed => "lexeme", ... });`
///
/// * The first list names marker variants that have no fixed spelling; they are
///   displayed as `<Name>` and report no lexeme.
/// * The second list names variants with a fixed spelling; they are displayed as that
///   spelling.
///
/// In the generated enum the fixed-lexeme variants are declared before the marker
/// variants, which determines the derived ordering.
macro_rules! tokens {
    (
        $vis:vis $ty_name:ident:
        {
            $($marker:ident),*
        },
        {
            $($fixed:ident => $text:literal),*
        }
    ) => {
        // Not every variant is necessarily constructed by the crate that declares it.
        #[allow(dead_code)]
        #[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)]
        $vis enum $ty_name {
            $($fixed,)*
            $($marker,)*
        }

        impl ::core::fmt::Display for $ty_name {
            fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result {
                match self {
                    $(Self::$fixed => f.write_str($text),)*
                    $(Self::$marker => f.write_str(concat!("<", stringify!($marker), ">")),)*
                }
            }
        }

        // Not every helper is necessarily called by the crate that declares the enum.
        #[allow(dead_code)]
        impl $ty_name {
            /// Returns the fixed spelling of this token, or `None` for marker tokens.
            $vis fn lexeme(&self) -> Option<&'static str> {
                match self {
                    $(Self::$fixed => Some($text),)*
                    $(Self::$marker => None,)*
                }
            }

            /// Returns the number of chars in this lexeme (0 for marker tokens).
            $vis fn lexeme_len(&self) -> usize {
                self.lexeme().map_or(0, |text| text.chars().count())
            }

            /// Returns the number of UTF-8 bytes in this lexeme (0 for marker tokens).
            $vis fn lexeme_len_utf8(&self) -> usize {
                self.lexeme().map_or(0, str::len)
            }

            /// Returns `true` when this token's lexeme is itself lexically an identifier,
            /// i.e. it could be the prefix of a longer identifier.
            $vis fn maybe_ident(&self) -> bool {
                self.lexeme().is_some_and(crate::is_things::is_ident)
            }

            /// Returns every fixed-lexeme token paired with its spelling, in declaration
            /// order.
            $vis fn lexemes() -> &'static [(Self, &'static str)] {
                &[
                    $((Self::$fixed, $text)),*
                ]
            }
        }
    };
}
// Declares the `Token` enum through the `tokens!` macro.
// The first list holds marker tokens without a fixed spelling, the second list maps
// every punctuation, keyword and operator token to its lexeme.
tokens!(pub Token: {
Eof,
ParseError,
// Marker Token for any Comment
Comment,
DocComment,
// NOTE(review): this spot used to announce a marker token for pre-processing
// directives, but no such variant is declared.
// Literal constant kinds — presumably produced by `complex_tokens`; verify there.
CharConstant,
IntegerConstant,
IntegerHexConstant,
IntegerBinConstant,
IntegerOctConstant,
FloatingConstant,
FloatingExpConstant,
DotFloatingConstant,
DotFloatingExpConstant,
StringConstant,
Ident
},
// Lexical Tokens (variants with a fixed lexeme):
{
// Line comment introducers
SlashSlash => "//",
SlashSlashSlash => "///",
// Block comment delimiters are currently disabled:
// SlashStar => "/*",
// SlashStarStar => "/**",
//StarSlash => "*/",
// Punctuation:
OpenParens => "(",
CloseParens => ")",
OpenBrace => "{",
CloseBrace => "}",
OpenSquareBracket => "[",
CloseSquareBracket => "]",
Semi => ";",
Comma => ",",
Elipsis3 => "...",
Elipsis2 => "..",
Colon => ":",
Equal => "=",
// Keywords:
Void => "void",
Bool => "bool",
F32 => "f32",
F64 => "f64",
ISize => "isize",
USize => "usize",
U1 => "u1",
U8 => "u8",
U16 => "u16",
U32 => "u32",
U64 => "u64",
I1 => "i1",
I8 => "i8",
I16 => "i16",
I32 => "i32",
I64 => "i64",
Const => "const",
Volatile => "volatile",
Noalias => "noalias",
Fn => "fn",
Let => "let",
Var => "var",
If => "if",
As => "as",
Else => "else",
Return => "return",
Struct => "struct",
Type => "type",
Union => "union",
Enum => "enum",
Packed => "packed",
Extern => "extern",
Pub => "pub",
// Operators
Dot => ".",
MinusGreater => "->",
Bang => "!",
Tilde => "~",
Plus => "+",
// Increment is currently disabled:
// PlusPlus => "++",
Minus => "-",
// Decrement is currently disabled:
// MinusMinus => "--",
Star => "*",
Slash => "/",
Percent => "%",
Less => "<",
Greater => ">",
LessEqual => "<=",
GreaterEqual => ">=",
EqualEqual => "==",
BangEqual => "!=",
PipePipe => "||",
AmpersandAmpersand => "&&",
Ampersand => "&",
Caret => "^",
Pipe => "|",
LessLess => "<<",
GreaterGreater => ">>",
Question => "?",
// Compound assignment operators
PlusEqual => "+=",
MinusEqual => "-=",
StarEqual => "*=",
SlashEqual => "/=",
PercentEqual => "%=",
AmpersandEqual => "&=",
PipeEqual => "|=",
CaretEqual => "^=",
LessLessEqual => "<<=",
GreaterGreaterEqual => ">>="
});
/// Operator classification helpers.
impl Token {
    /// Returns `true` for `=` and every compound assignment operator (`+=`, `<<=`, ...).
    pub fn is_assignment_op(self) -> bool {
        matches!(
            self,
            Token::Equal
                | Token::PlusEqual
                | Token::MinusEqual
                | Token::StarEqual
                | Token::SlashEqual
                | Token::PercentEqual
                | Token::AmpersandEqual
                | Token::PipeEqual
                | Token::CaretEqual
                | Token::LessLessEqual
                | Token::GreaterGreaterEqual
        )
    }

    /// Returns `true` for the tokens treated as unary operators: `+`, `-`, `*`, `&`
    /// and `!`.
    pub fn is_unary_op(self) -> bool {
        matches!(
            self,
            Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang
        )
    }

    /// Returns `true` for the arithmetic, bitwise, shift, logical and comparison
    /// operators that take two operands.
    pub fn is_binary_op(self) -> bool {
        matches!(
            self,
            // arithmetic
            Token::Plus
                | Token::Minus
                | Token::Star
                | Token::Slash
                | Token::Percent
                // bitwise and shifts
                | Token::Ampersand
                | Token::Pipe
                | Token::Caret
                | Token::LessLess
                | Token::GreaterGreater
                // logical
                | Token::AmpersandAmpersand
                | Token::PipePipe
                // comparison
                | Token::EqualEqual
                | Token::BangEqual
                | Token::Less
                | Token::Greater
                | Token::LessEqual
                | Token::GreaterEqual
        )
    }
}
use trie::Tree;
/// Iterator adapter that keeps a running total of the UTF-8 byte length of every `char`
/// it has yielded.
///
/// The wrapped iterator stays reachable through `Deref`/`DerefMut`. Advancing the inner
/// iterator directly through that access is not reflected in the running total.
#[derive(Debug, Clone, Copy)]
struct CountingIterator<I: Iterator> {
    /// The wrapped iterator.
    iter: I,
    /// Sum of `len_utf8()` over all chars yielded so far.
    count: usize,
}

impl<I: Iterator> From<I> for CountingIterator<I> {
    /// Wraps `iter` with the byte counter starting at zero.
    fn from(iter: I) -> Self {
        CountingIterator { count: 0, iter }
    }
}

impl<I> Iterator for CountingIterator<I>
where
    I: Iterator<Item = char>,
{
    type Item = I::Item;

    /// Yields the next char of the wrapped iterator and adds its UTF-8 length to the
    /// running total.
    fn next(&mut self) -> Option<Self::Item> {
        let ch = self.iter.next()?;
        self.count += ch.len_utf8();
        Some(ch)
    }
}

impl<I: Iterator> CountingIterator<I> {
    /// Number of UTF-8 bytes yielded through this adapter so far.
    pub(crate) fn offset(&self) -> usize {
        self.count
    }
}

impl<I: Iterator> core::ops::Deref for CountingIterator<I> {
    type Target = I;

    fn deref(&self) -> &I {
        &self.iter
    }
}

impl<I: Iterator> core::ops::DerefMut for CountingIterator<I> {
    fn deref_mut(&mut self) -> &mut I {
        &mut self.iter
    }
}
/// Character stream handed to the sub-parsers: a peekable `Chars` iterator wrapped in a
/// `CountingIterator`, so the number of consumed bytes can be read back via `offset()`.
type Source<'a> = CountingIterator<core::iter::Peekable<core::str::Chars<'a>>>;
/// Lexer state: walks `source` and produces tokens through its `Iterator` impl.
pub struct TokenIterator<'a> {
    /// Trie holding every fixed lexeme from `Token::lexemes()`, keyed by its chars.
    trie: Tree<char, Token>,
    /// The complete input text.
    source: &'a str,
    /// Byte index into `source` of the next character to lex. It is only ever advanced
    /// by whole characters, so it always sits on a char boundary.
    offset: usize,
}

impl<'a> TokenIterator<'a> {
    /// Creates a lexer positioned at the start of `source`.
    ///
    /// The lexeme trie is rebuilt from `Token::lexemes()` on every call.
    pub fn new(source: &'a str) -> Self {
        let mut trie = Tree::new();
        for (token, token_str) in Token::lexemes() {
            trie.insert(token_str.chars(), *token);
        }
        Self {
            trie,
            source,
            offset: 0,
        }
    }

    /// Returns a fresh counting, peekable char stream over the not-yet-lexed input.
    /// The lexer's own position is not affected by consuming the returned stream.
    fn peekable_source(&self) -> Source<'a> {
        CountingIterator::from(self.source[self.offset..].chars().peekable())
    }

    /// Looks the upcoming input up in the lexeme trie. On a hit the position is advanced
    /// past the matched lexeme and the token is returned; otherwise the position is left
    /// untouched and `None` is returned.
    fn parse(&mut self) -> Option<Token> {
        let mut iter = CountingIterator::from(self.source[self.offset..].chars());
        let token = *self.trie.get_closest(&mut iter)?;
        // `offset` is a byte index, so advance by the lexeme's byte length rather than
        // its char count (the two only coincide for ASCII lexemes).
        self.offset += token.lexeme_len_utf8();
        Some(token)
    }

    /// Skips all whitespace at the current position; returns the number of bytes skipped.
    fn skip_whitespaces(&mut self) -> usize {
        self.skip_while(is_things::is_whitespace)
    }

    /// Skips up to `n` characters (fewer if the input ends first); returns the number of
    /// bytes skipped. `skip(0)` is a no-op.
    fn skip(&mut self, mut n: usize) -> usize {
        self.skip_while(|_| {
            // Test before decrementing: decrementing first skipped only `n - 1`
            // characters and underflowed for `n == 0`.
            if n == 0 {
                return false;
            }
            n -= 1;
            true
        })
    }

    /// Skips characters for as long as `pred` returns `true`; returns the number of
    /// bytes skipped. `pred` is called once per skipped character plus once for the
    /// first rejected character, if there is one.
    fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) -> usize {
        let skipped: usize = self.source[self.offset..]
            .chars()
            .take_while(|&c| pred(c))
            .map(char::len_utf8)
            .sum();
        self.offset += skipped;
        skipped
    }
}
/// Yields `(token, text)` pairs, where `text` is the slice of the input the token spans.
impl<'a> Iterator for TokenIterator<'a> {
    type Item = (Token, &'a str);

    /// Lexes the next token.
    ///
    /// Returns `None` at the end of the input, and also when a constant fails to parse
    /// or when the next character cannot start any token.
    // NOTE(review): the last two cases end the stream silently even though
    // `Token::ParseError` exists — confirm whether that is intended.
    fn next(&mut self) -> Option<Self::Item> {
        self.skip_whitespaces();
        let start = self.offset;

        // Two independent views of the remaining input: `cursor` is only used for
        // look-ahead, while `source` is handed to the sub-parsers and afterwards
        // reports how many bytes they consumed.
        let mut source = self.peekable_source();
        let mut cursor = self.peekable_source();

        match cursor.next() {
            // Numeric constant starting with a digit.
            Some('0'..='9') => {
                let token = complex_tokens::parse_constant(&mut source).ok()?;
                self.offset += source.offset();
                Some((token, &self.source[start..self.offset]))
            }
            // Numeric constant starting with a dot, e.g. `.5`.
            Some('.') if cursor.next().is_some_and(is_things::is_digit) => {
                let token = complex_tokens::parse_constant(&mut source).ok()?;
                self.offset += source.offset();
                Some((token, &self.source[start..self.offset]))
            }
            // Char or string constant.
            Some('\'' | '"') => {
                let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?;
                self.offset += source.offset();
                Some((token, &self.source[start..self.offset]))
            }
            // Raw identifier: a run of identifier characters enclosed in backticks.
            Some('`') => {
                self.skip(1);
                self.skip_while(is_things::is_id_continue);
                if self.source[self.offset..].starts_with('`') {
                    self.skip(1);
                    Some((Token::Ident, &self.source[start..self.offset]))
                } else {
                    // unterminated raw identifier
                    Some((Token::ParseError, &self.source[start..self.offset]))
                }
            }
            // Everything else: fixed lexemes (including comment introducers) and
            // plain identifiers.
            _ => match self.parse() {
                // `//`-style comments run up to, but not including, the next newline.
                Some(Token::SlashSlash) => {
                    self.skip_while(|c| c != '\n');
                    Some((Token::Comment, &self.source[start..self.offset]))
                }
                // `///`-style doc comments are delimited the same way.
                Some(Token::SlashSlashSlash) => {
                    self.skip_while(|c| c != '\n');
                    Some((Token::DocComment, &self.source[start..self.offset]))
                }
                Some(tok) => {
                    // A lexeme that is itself identifier-like (a keyword, or `-`) and is
                    // directly followed by more identifier characters is really the
                    // prefix of a longer identifier.
                    if tok.maybe_ident() && self.skip_while(is_things::is_id_continue) > 0 {
                        Some((Token::Ident, &self.source[start..self.offset]))
                    } else {
                        Some((tok, &self.source[start..self.offset]))
                    }
                }
                None => {
                    let starts_ident = self.source[self.offset..]
                        .chars()
                        .next()
                        .is_some_and(is_things::is_id_start);
                    if !starts_ident {
                        return None;
                    }
                    self.skip(1);
                    self.skip_while(is_things::is_id_continue);
                    Some((Token::Ident, &self.source[start..self.offset]))
                }
            },
        }
    }
}
mod complex_tokens;
#[cfg(test)]
mod tests {
    use super::*;

    /// Keywords and punctuation are split correctly, with and without whitespace.
    #[test]
    fn test_iterator() {
        let tokens = "fn let void+(+bool)";
        let mut lexer = TokenIterator::new(tokens);
        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
        assert_eq!(lexer.next(), Some((Token::Let, "let")));
        assert_eq!(lexer.next(), Some((Token::Void, "void")));
        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
        assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
        assert_eq!(lexer.next(), Some((Token::Bool, "bool")));
        assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
        assert_eq!(lexer.next(), None);
    }

    /// Every whitespace-separated word here must lex as a single identifier.
    #[test]
    fn idents() {
        // The raw identifier is terminated: the lexer reports an unterminated one as
        // `Token::ParseError`, not as `Token::Ident`.
        let lexer = TokenIterator::new("a a1 a_ a-b _a _1 _- -a -1 -_ `123`");
        assert!(lexer.map(|(tok, _)| tok).all(|tok| tok == Token::Ident));
    }

    /// A `-` glued to identifier characters belongs to the identifier; a lone `-` is the
    /// minus operator.
    #[test]
    fn ident_minus_ambiguity() {
        let lexer = TokenIterator::new("a-a a- - a -a --a");
        let tokens = lexer.map(|(tok, _)| tok).collect::<Vec<_>>();
        assert_eq!(
            tokens,
            vec![
                Token::Ident,
                Token::Ident,
                Token::Minus,
                Token::Ident,
                Token::Ident,
                Token::Ident
            ]
        );
    }

    /// A complete function definition yields the expected token/text pairs.
    #[test]
    fn complex_iterator() {
        let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }";
        let mut lexer = TokenIterator::new(tokens);
        assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
        assert_eq!(lexer.next(), Some((Token::Ident, "my-function")));
        assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
        assert_eq!(lexer.next(), Some((Token::Ident, "x")));
        assert_eq!(lexer.next(), Some((Token::Colon, ":")));
        assert_eq!(lexer.next(), Some((Token::I32, "i32")));
        assert_eq!(lexer.next(), Some((Token::Comma, ",")));
        assert_eq!(lexer.next(), Some((Token::Ident, "y")));
        assert_eq!(lexer.next(), Some((Token::Colon, ":")));
        assert_eq!(lexer.next(), Some((Token::F32, "f32")));
        assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
        assert_eq!(lexer.next(), Some((Token::MinusGreater, "->")));
        assert_eq!(lexer.next(), Some((Token::F32, "f32")));
        assert_eq!(lexer.next(), Some((Token::OpenBrace, "{")));
        assert_eq!(lexer.next(), Some((Token::Return, "return")));
        assert_eq!(lexer.next(), Some((Token::Ident, "x")));
        assert_eq!(lexer.next(), Some((Token::Plus, "+")));
        assert_eq!(lexer.next(), Some((Token::Ident, "y")));
        assert_eq!(lexer.next(), Some((Token::Semi, ";")));
        assert_eq!(lexer.next(), Some((Token::CloseBrace, "}")));
        assert_eq!(lexer.next(), None);
    }
}