//! Token definitions and longest-match lexeme parsing.
/// Generates a token enum plus `Display` and lexeme-lookup helpers.
///
/// Invocation shape:
/// `tokens!(vis Name: { Marker1, Marker2, ... }, { Named => "lexeme", ... })`
///
/// The first brace group lists marker tokens with no fixed spelling
/// (identifiers, constants, ...); the second maps tokens to their fixed
/// lexemes. The fixed-lexeme variants are emitted first in the enum.
macro_rules! tokens {
    ($vis:vis $ty_name:ident:
    {
        $($name2:ident),*
    },
    {
        $($name:ident => $lexeme:literal),*
    }) => {
        #[allow(dead_code)]
        #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
        $vis enum $ty_name {
            // Fixed-lexeme tokens first, then marker tokens.
            $($name,
            )*
            $($name2,)*
        }

        impl std::fmt::Display for $ty_name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                match self {
                    // Fixed-lexeme tokens print their lexeme verbatim...
                    $(Self::$name => write!(f, "{}", $lexeme),)*
                    // ...marker tokens print as their name in angle brackets,
                    // e.g. `<Ident>`.
                    $(Self::$name2 => write!(f, "<{}>", stringify!($name2))),*
                }
            }
        }

        #[allow(dead_code)]
        impl $ty_name {
            /// The fixed lexeme of this token, or `None` for marker tokens.
            $vis fn lexeme(&self) -> Option<&'static str> {
                match self {
                    $(Self::$name => Some($lexeme),)*
                    $(Self::$name2 => None),*
                }
            }

            /// returns the number of chars in this lexeme
            /// (0 for marker tokens, which have no fixed lexeme)
            $vis fn lexeme_len(&self) -> usize {
                self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0)
            }

            /// Whether this token's lexeme is shaped like an identifier
            /// (true for keywords; delegates to `crate::common::is_ident`).
            $vis fn maybe_ident(&self) -> bool {
                self.lexeme().map(|lexeme| crate::common::is_ident(lexeme)).unwrap_or(false)
            }

            /// Every fixed-lexeme token paired with its lexeme, in
            /// declaration order.
            $vis fn lexemes() -> &'static [(Self, &'static str)] {
                &[
                    $((Self::$name, $lexeme)),*
                ]
            }
        }
    };
}
|
|
|
|
// Generate the `Token` enum and its helpers (see the `tokens!` macro above).
tokens!(pub Token: {
    // Marker tokens: no fixed spelling; the actual text comes from the
    // source span recorded by the lexer.
    Eof,
    ParseError,
    // Marker Token for any Comment
    Comment,
    DocComment,
    // Constant and identifier-like tokens whose spelling varies.
    // NOTE(review): a previous comment here said "pre-processing directive",
    // which does not match the tokens that follow — presumably stale.
    CharConstant,
    IntegerConstant,
    IntegerHexConstant,
    IntegerBinConstant,
    IntegerOctConstant,
    FloatingConstant,
    FloatingExpConstant,
    DotFloatingConstant,
    DotFloatingExpConstant,
    StringConstant,
    IntegralType,
    Ident
},
// Lexical Tokens:
{
    // Comment delimiters:
    SlashSlash => "//",
    SlashStar => "/*",
    SlashStarStar => "/**",
    StarSlash => "*/",
    SlashSlashSlash => "///",
    // Punctuation:
    OpenParens => "(",
    CloseParens => ")",
    OpenBrace => "{",
    CloseBrace => "}",
    OpenSquareBracket => "[",
    CloseSquareBracket => "]",
    Semi => ";",
    Comma => ",",
    // NOTE(review): "Elipsis" is a misspelling of "Ellipsis"; renaming would
    // touch all call sites, so it is only flagged here.
    Elipsis3 => "...",
    Elipsis2 => "..",
    Colon => ":",
    Equal => "=",
    // Keywords:
    Void => "void",
    Bool => "bool",
    F32 => "f32",
    F64 => "f64",
    Const => "const",
    Fn => "fn",
    Let => "let",
    Var => "var",
    If => "if",
    As => "as",
    Else => "else",
    Return => "return",
    // Operators
    Dot => ".",
    MinusGreater => "->",
    Bang => "!",
    Tilde => "~",
    Plus => "+",
    Minus => "-",
    Star => "*",
    Slash => "/",
    Percent => "%",
    Less => "<",
    Greater => ">",
    LessEqual => "<=",
    GreaterEqual => ">=",
    EqualEqual => "==",
    BangEqual => "!=",
    PipePipe => "||",
    AmpersandAmpersand => "&&",
    Ampersand => "&",
    Caret => "^",
    Pipe => "|",
    LessLess => "<<",
    GreaterGreater => ">>",
    Question => "?",
    // Compound assignment operators:
    PlusEqual => "+=",
    MinusEqual => "-=",
    StarEqual => "*=",
    SlashEqual => "/=",
    PercentEqual => "%=",
    AmpersandEqual => "&=",
    PipeEqual => "|=",
    CaretEqual => "^=",
    LessLessEqual => "<<=",
    GreaterGreaterEqual => ">>="
});
|
|
|
|
impl Token {
|
|
pub fn is_assignment_op(self) -> bool {
|
|
match self {
|
|
Token::PlusEqual
|
|
| Token::MinusEqual
|
|
| Token::StarEqual
|
|
| Token::SlashEqual
|
|
| Token::PercentEqual
|
|
| Token::PipeEqual
|
|
| Token::CaretEqual
|
|
| Token::AmpersandEqual
|
|
| Token::LessLessEqual
|
|
| Token::GreaterGreaterEqual
|
|
| Token::Equal => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
pub fn is_unary_op(self) -> bool {
|
|
match self {
|
|
Token::Plus | Token::Minus | Token::Star | Token::Ampersand | Token::Bang => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
pub fn is_binary_op(self) -> bool {
|
|
match self {
|
|
Token::Star
|
|
| Token::Slash
|
|
| Token::Percent
|
|
| Token::Pipe
|
|
| Token::Ampersand
|
|
| Token::Caret
|
|
| Token::Plus
|
|
| Token::Minus
|
|
| Token::PipePipe
|
|
| Token::AmpersandAmpersand
|
|
| Token::BangEqual
|
|
| Token::EqualEqual
|
|
| Token::Less
|
|
| Token::Greater
|
|
| Token::LessEqual
|
|
| Token::GreaterEqual
|
|
| Token::LessLess
|
|
| Token::GreaterGreater => true,
|
|
_ => false,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Helper type for parsing tokens that have a defined lexeme, such as `fn`,
/// `f32`, `const`, etc. Tokens with variable lexemes, such as primitive
/// integral types, constants or identifiers are not parsed by this.
pub struct LexemeParser {
    // Tokens whose lexemes still match the input consumed so far.
    lexemes: Vec<Token>,
    // Fully matched tokens; later entries matched longer lexemes, so the
    // last element is the best (longest) match.
    candidates: Vec<Token>,
    // Number of chars consumed so far.
    len: usize,
}
|
|
|
|
impl LexemeParser {
|
|
pub fn new() -> Self {
|
|
let lexemes = Token::lexemes()
|
|
.iter()
|
|
.map(|(tok, _)| tok.clone())
|
|
.collect::<Vec<_>>();
|
|
|
|
Self {
|
|
lexemes,
|
|
candidates: vec![],
|
|
len: 0,
|
|
}
|
|
}
|
|
|
|
pub fn finish(mut self) -> Option<Token> {
|
|
self.candidates.pop()
|
|
}
|
|
|
|
pub fn parse(mut tokens: impl Iterator<Item = char>) -> Option<Token> {
|
|
let mut this = Self::new();
|
|
loop {
|
|
let Some(ch) = tokens.next() else {
|
|
break;
|
|
};
|
|
|
|
if crate::common::is_whitespace(ch) {
|
|
break;
|
|
}
|
|
|
|
match this.advance(ch)? {
|
|
None => {}
|
|
Some(token) => {
|
|
return Some(token);
|
|
}
|
|
}
|
|
}
|
|
this.finish()
|
|
}
|
|
|
|
/// Accepts a `char` and returns `Some(None)` until it is done trying to parse the longest lexeme.
|
|
/// If no more potential matches are available, returns the longest matched token as `Some(Token)`, or `None` on failure.
|
|
|
|
/// accepts a char and returns `None` until it is done trying to parse the longest `Token`.
|
|
/// when finished, returns a Token, if it parsed one, or `Some(None)`.
|
|
pub fn advance(&mut self, ch: char) -> Option<Option<Token>> {
|
|
self.len += 1;
|
|
|
|
// advance match
|
|
// keep tokens whose lexemes match the next char
|
|
self.lexemes.retain(|tok| {
|
|
// SAFETY: all of these tokens are lexical, and every character in
|
|
// them is represented by a single byte and we know they must be
|
|
// utf8/ascii.
|
|
unsafe {
|
|
char::from_u32_unchecked(tok.lexeme().unwrap().as_bytes()[self.len - 1] as u32)
|
|
== ch
|
|
}
|
|
});
|
|
|
|
// A token has been successfully matched completely if it has not yet
|
|
// been removed from the lexeme list but the length of it's lexeme is no
|
|
// greater than the number of chars we've received.
|
|
self.candidates.extend(self.lexemes.extract_if(|tok| {
|
|
// SAFETY: as above, all of the tokens in self.lexemes are
|
|
// lexical and are all single byte characters.
|
|
tok.lexeme().unwrap().as_bytes().len() <= self.len
|
|
}));
|
|
|
|
// we prefer the longer match
|
|
// that means that a+++++b doesn't parse and a+++(++b) is a++ + ++b
|
|
// `&&i` is also LogicalAnd i and not Ampersand Ampersand i
|
|
// Somehow, this is also a gnu extension...
|
|
|
|
if self.lexemes.is_empty() {
|
|
// return match, if it exists
|
|
return match self.candidates.pop() {
|
|
Some(token) => Some(Some(token)),
|
|
None => None,
|
|
};
|
|
}
|
|
|
|
return Some(None);
|
|
}
|
|
}
|
|
|
|
// A `Token` together with its position in the source.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenPos {
    pub token: Token,
    // Offset where the token starts.
    pub start: u32,
    // Offset where the token ends.
    // NOTE(review): whether `end` is inclusive or exclusive is not visible
    // here — confirm against the lexer that produces these.
    pub end: u32,
}
|
|
|
|
impl TokenPos {
|
|
pub fn new(token: Token, start: u32, end: u32) -> Self {
|
|
Self { token, start, end }
|
|
}
|
|
}
|