// SeaLang/src/tokens.rs

use std::collections::HashMap;
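/// Generates the token enum from two lists: marker tokens with no fixed
/// lexeme (e.g. `Ident`, `Eof`) and fixed-lexeme tokens (`Name => "lexeme"`),
/// along with a `Display` impl and lexeme-lookup helpers.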
macro_rules! tokens {
($vis:vis $ty_name:ident:
{
$($name2:ident),*
},
{
$($name:ident => $lexeme:literal),*
}) => {
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
$vis enum $ty_name {
$($name,
)*
$($name2,)*
}
impl std::fmt::Display for $ty_name {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
$(Self::$name => write!(f, "{}", $lexeme),)*
$(Self::$name2 => write!(f, "<{}>", stringify!($name2))),*
}
}
}
#[allow(dead_code)]
impl $ty_name {
$vis fn lexeme(&self) -> Option<&'static str> {
match self {
$(Self::$name => Some($lexeme),)*
$(Self::$name2 => None),*
}
}
        /// Returns the number of chars in this token's lexeme, or 0 if it has none.
$vis fn lexeme_len(&self) -> usize {
self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0)
}
        /// Returns `true` if this token's fixed lexeme would also lex as an
        /// identifier (e.g. keywords such as `fn`).
        $vis fn maybe_ident(&self) -> bool {
self.lexeme().map(|lexeme| crate::common::is_ident(lexeme)).unwrap_or(false)
}
        /// All fixed-lexeme tokens paired with their lexemes.
        $vis fn lexemes() -> &'static [(Self, &'static str)] {
&[
$((Self::$name, $lexeme)),*
]
}
}
};
}
tokens!(pub Token: {
Eof,
ParseError,
    // Marker tokens for comments:
    Comment,
    DocComment,
    // Marker tokens for constants and literals:
    CharConstant,
IntegerConstant,
IntegerHexConstant,
IntegerBinConstant,
IntegerOctConstant,
FloatingConstant,
FloatingExpConstant,
DotFloatingConstant,
DotFloatingExpConstant,
StringConstant,
IntegralType,
Ident
},
// Lexical Tokens:
{
SlashSlash => "//",
SlashStar => "/*",
SlashStarStar => "/**",
StarSlash => "*/",
SlashSlashSlash => "///",
// Punctuation:
OpenParens => "(",
CloseParens => ")",
OpenBrace => "{",
CloseBrace => "}",
OpenSquareBracket => "[",
CloseSquareBracket => "]",
Semi => ";",
Comma => ",",
Elipsis3 => "...",
Elipsis2 => "..",
Colon => ":",
Equal => "=",
// Keywords:
Void => "void",
Bool => "bool",
F32 => "f32",
F64 => "f64",
ISize => "isize",
USize => "usize",
Const => "const",
Volatile => "volatile",
Noalias => "noalias",
Fn => "fn",
Let => "let",
Var => "var",
If => "if",
As => "as",
Else => "else",
Return => "return",
Struct => "struct",
Type => "type",
Union => "union",
Enum => "enum",
Packed => "packed",
Extern => "extern",
Pub => "pub",
// Operators
Dot => ".",
MinusGreater => "->",
Bang => "!",
Tilde => "~",
Plus => "+",
Minus => "-",
Star => "*",
Slash => "/",
Percent => "%",
Less => "<",
Greater => ">",
LessEqual => "<=",
GreaterEqual => ">=",
EqualEqual => "==",
BangEqual => "!=",
PipePipe => "||",
AmpersandAmpersand => "&&",
Ampersand => "&",
Caret => "^",
Pipe => "|",
LessLess => "<<",
GreaterGreater => ">>",
Question => "?",
PlusEqual => "+=",
MinusEqual => "-=",
StarEqual => "*=",
SlashEqual => "/=",
PercentEqual => "%=",
AmpersandEqual => "&=",
PipeEqual => "|=",
CaretEqual => "^=",
LessLessEqual => "<<=",
GreaterGreaterEqual => ">>="
});
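// A small illustrative test of the API `tokens!` generates (the module name
// is new here): fixed-lexeme tokens display as their lexeme, marker tokens
// display as `<Name>` and report no lexeme.
#[cfg(test)]
mod token_display_tests {
    use super::Token;

    #[test]
    fn display_and_lexeme() {
        assert_eq!(Token::Fn.to_string(), "fn");
        assert_eq!(Token::LessLessEqual.to_string(), "<<=");
        assert_eq!(Token::Ident.to_string(), "<Ident>");
        assert_eq!(Token::Fn.lexeme(), Some("fn"));
        assert_eq!(Token::Eof.lexeme(), None);
        assert_eq!(Token::Eof.lexeme_len(), 0);
    }
}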
impl Token {
    pub fn is_assignment_op(self) -> bool {
        matches!(
            self,
            Token::PlusEqual
                | Token::MinusEqual
                | Token::StarEqual
                | Token::SlashEqual
                | Token::PercentEqual
                | Token::PipeEqual
                | Token::CaretEqual
                | Token::AmpersandEqual
                | Token::LessLessEqual
                | Token::GreaterGreaterEqual
                | Token::Equal
        )
    }
    pub fn is_unary_op(self) -> bool {
        matches!(
            self,
            Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang
        )
    }
    pub fn is_binary_op(self) -> bool {
        matches!(
            self,
            Token::Star
                | Token::Slash
                | Token::Percent
                | Token::Pipe
                | Token::Ampersand
                | Token::Caret
                | Token::Plus
                | Token::Minus
                | Token::PipePipe
                | Token::AmpersandAmpersand
                | Token::BangEqual
                | Token::EqualEqual
                | Token::Less
                | Token::Greater
                | Token::LessEqual
                | Token::GreaterEqual
                | Token::LessLess
                | Token::GreaterGreater
        )
    }
}
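// Quick sanity sketch of the operator classification above (test-only, the
// module name is new here): `=` is assignment-only, `==` is a binary
// comparison, and `*` doubles as unary deref and binary multiplication.
#[cfg(test)]
mod op_class_tests {
    use super::Token;

    #[test]
    fn classification() {
        assert!(Token::Equal.is_assignment_op());
        assert!(!Token::Equal.is_binary_op());
        assert!(Token::EqualEqual.is_binary_op());
        assert!(Token::Star.is_unary_op() && Token::Star.is_binary_op());
    }
}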
/// Helper type for parsing tokens that have a fixed lexeme, such as `fn`,
/// `f32`, or `const`. Tokens with variable lexemes, such as primitive
/// integral types, constants, or identifiers, are not handled here.
pub struct LexemeParser {
lexemes: Vec<Token>,
candidates: Vec<Token>,
len: usize,
}
impl LexemeParser {
pub fn new() -> Self {
        let lexemes = Token::lexemes()
            .iter()
            .map(|&(tok, _)| tok)
            .collect::<Vec<_>>();
Self {
lexemes,
candidates: vec![],
len: 0,
}
}
    /// Consumes the parser, returning the longest token matched so far, if any.
    pub fn finish(mut self) -> Option<Token> {
self.candidates.pop()
}
    pub fn parse(chars: impl Iterator<Item = char>) -> Option<Token> {
        let mut this = Self::new();
        for ch in chars {
            if crate::common::is_whitespace(ch) {
                break;
            }
            if let Some(token) = this.advance(ch)? {
                return Some(token);
            }
        }
        this.finish()
    }
    /// Accepts a `char` and returns `Some(None)` while a longer lexeme may
    /// still match. Once no further match is possible, returns the longest
    /// matched token as `Some(Some(token))`, or `None` if nothing matched.
pub fn advance(&mut self, ch: char) -> Option<Option<Token>> {
self.len += 1;
        // Keep only the tokens whose lexeme continues to match. Every fixed
        // lexeme is plain ASCII, so each byte is one char, and indexing by
        // chars-consumed is in bounds: any lexeme still here was longer than
        // the previous count.
        self.lexemes.retain(|tok| {
            let byte = tok.lexeme().unwrap().as_bytes()[self.len - 1];
            u32::from(byte) == u32::from(ch)
        });
        // A token is fully matched once it has survived the retain above and
        // its lexeme is no longer than the number of chars consumed so far.
        self.candidates.extend(self.lexemes.extract_if(|tok| {
            // As above, all fixed lexemes are ASCII, so byte length equals
            // char count.
            tok.lexeme().unwrap().len() <= self.len
        }));
        // We prefer the longest match (maximal munch): `&&i` lexes as
        // `AmpersandAmpersand` `i`, never `Ampersand` `Ampersand` `i`. In C
        // this is the rule that makes `a+++++b` fail to parse while
        // `a+++(++b)` is fine as `a++ + (++b)`; apparently this is also a
        // GNU extension.
        if self.lexemes.is_empty() {
            // No longer match is possible; yield the longest one found, if any.
            return self.candidates.pop().map(Some);
        }
        Some(None)
}
}
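// A minimal usage sketch (test-only, the module name is new here): the
// parser prefers the longest fixed lexeme, so `<<=` yields one
// `LessLessEqual` token and `&&` one `AmpersandAmpersand`, and parsing
// stops at whitespace.
#[cfg(test)]
mod lexeme_parser_tests {
    use super::{LexemeParser, Token};

    #[test]
    fn longest_match_wins() {
        assert_eq!(LexemeParser::parse("<<=".chars()), Some(Token::LessLessEqual));
        assert_eq!(LexemeParser::parse("&&".chars()), Some(Token::AmpersandAmpersand));
        assert_eq!(LexemeParser::parse("fn ".chars()), Some(Token::Fn));
    }
}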
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenPos {
pub token: Token,
pub start: u32,
pub end: u32,
}
impl TokenPos {
pub fn new(token: Token, start: u32, end: u32) -> Self {
Self { token, start, end }
}
}
/// Binary-operator precedence: higher values bind more tightly, mirroring C
/// (multiplicative operators highest, `||` lowest).
pub static PRECEDENCE_MAP: std::sync::LazyLock<HashMap<Token, u32>> =
    std::sync::LazyLock::new(|| {
HashMap::from([
(Token::PipePipe, 10),
(Token::AmpersandAmpersand, 20),
(Token::Pipe, 30),
(Token::Caret, 40),
(Token::Ampersand, 50),
(Token::BangEqual, 60),
(Token::EqualEqual, 60),
(Token::LessEqual, 70),
(Token::GreaterEqual, 70),
(Token::Less, 70),
(Token::Greater, 70),
(Token::GreaterGreater, 80),
(Token::LessLess, 80),
(Token::Plus, 90),
(Token::Minus, 90),
(Token::Percent, 100),
(Token::Star, 100),
(Token::Slash, 100),
])
});
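// A small sanity sketch of the table (test-only, the module name is new
// here): higher precedence binds tighter, e.g. `*` over `+`, `+` over `<<`,
// and comparisons over `||`.
#[cfg(test)]
mod precedence_tests {
    use super::{Token, PRECEDENCE_MAP};

    #[test]
    fn c_like_ordering() {
        let p = |t: Token| *PRECEDENCE_MAP.get(&t).unwrap();
        assert!(p(Token::Star) > p(Token::Plus));
        assert!(p(Token::Plus) > p(Token::LessLess));
        assert!(p(Token::Less) > p(Token::PipePipe));
    }
}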