lexer crate
This commit is contained in:
parent
fafd4011e2
commit
82d2eed09a
|
@ -30,6 +30,8 @@ werkzeug = { path = "../../rust/werkzeug" }
|
|||
[workspace.dependencies]
|
||||
unicode-xid = "0.2.4"
|
||||
tracing = "0.1.41"
|
||||
thiserror = "1.0.63"
|
||||
itertools = "0.13.0"
|
||||
|
||||
werkzeug = { path = "../../rust/werkzeug" }
|
||||
trie = { path = "../../rust/trie" }
|
|
@ -6,5 +6,7 @@ edition = "2024"
|
|||
[dependencies]
|
||||
tracing = { workspace = true }
|
||||
werkzeug = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
itertools = { workspace = true }
|
||||
trie = { workspace = true }
|
||||
unicode-xid = { workspace = true }
|
394
crates/lexer/src/complex_tokens.rs
Normal file
394
crates/lexer/src/complex_tokens.rs
Normal file
|
@ -0,0 +1,394 @@
|
|||
use crate::{Source, Token, is_things};
|
||||
use itertools::Itertools;
|
||||
use werkzeug::iter::{FallibleMapIter, NextIf};
|
||||
|
||||
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
||||
pub enum Error {
|
||||
#[error("{0}")]
|
||||
StringError(String),
|
||||
#[error("Exp part of floating constant had no digits.")]
|
||||
FloatingConstantExpPartNoDigit,
|
||||
#[error("constant cannot start with leading underscore '_'.")]
|
||||
NumericalConstantDigitLeadingUnderscore,
|
||||
#[error("Expected digit here for constant.")]
|
||||
NumericalConstantDigitNoDigit,
|
||||
#[error("Expected digit here for integer constant.")]
|
||||
IntegralTypeExpectedDigit,
|
||||
#[error("Floating constant has invalid trailing type.")]
|
||||
FloatingConstantInvalidTrailingType,
|
||||
#[error("Invalid token.")]
|
||||
InvalidToken,
|
||||
#[error("Identifier starts with invalid character.")]
|
||||
ExpectedIdStartForIdentifier,
|
||||
#[error("Unknown suffix in constant.")]
|
||||
NumericalConstantUnknownSuffix,
|
||||
}
|
||||
|
||||
type Result<T> = core::result::Result<T, Error>;
|
||||
|
||||
/// Numeric base of an integer constant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Radix {
    /// Base 16.
    Hex,
    /// Base 2.
    Bin,
    /// Base 10.
    Dec,
    /// Base 8.
    Oct,
}
|
||||
|
||||
impl Radix {
|
||||
#[allow(unused)]
|
||||
/// must be called with one of `['b','x','d','o']`
|
||||
unsafe fn from_char_unchecked(c: char) -> Self {
|
||||
match c.to_ascii_lowercase() {
|
||||
'o' => Self::Oct,
|
||||
'b' => Self::Bin,
|
||||
'x' => Self::Hex,
|
||||
'd' => Self::Dec,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
fn from_char(c: char) -> Option<Self> {
|
||||
match c.to_ascii_lowercase() {
|
||||
'o' => Some(Self::Oct),
|
||||
'b' => Some(Self::Bin),
|
||||
'x' => Some(Self::Hex),
|
||||
'd' => Some(Self::Dec),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
pub fn radix(self) -> u8 {
|
||||
match self {
|
||||
Radix::Hex => 16,
|
||||
Radix::Bin => 2,
|
||||
Radix::Oct => 8,
|
||||
Radix::Dec => 10,
|
||||
}
|
||||
}
|
||||
fn to_token(self) -> Token {
|
||||
match self {
|
||||
Radix::Hex => Token::IntegerHexConstant,
|
||||
Radix::Bin => Token::IntegerBinConstant,
|
||||
Radix::Oct => Token::IntegerOctConstant,
|
||||
Radix::Dec => Token::IntegerConstant,
|
||||
}
|
||||
}
|
||||
pub fn from_token(token: Token) -> Option<Self> {
|
||||
match token {
|
||||
Token::IntegerHexConstant => Some(Radix::Hex),
|
||||
Token::IntegerBinConstant => Some(Radix::Bin),
|
||||
Token::IntegerOctConstant => Some(Radix::Oct),
|
||||
Token::IntegerConstant => Some(Radix::Dec),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub fn map_digit(self, c: char) -> u8 {
|
||||
match self {
|
||||
Radix::Hex => match c {
|
||||
'0'..='9' => c as u8 - b'0',
|
||||
'a'..='f' => 10 + c as u8 - b'a',
|
||||
'A'..='F' => 10 + c as u8 - b'A',
|
||||
_ => unreachable!(),
|
||||
},
|
||||
Radix::Bin => match c {
|
||||
'0'..='1' => c as u8 - b'0',
|
||||
_ => unreachable!(),
|
||||
},
|
||||
Radix::Dec => match c {
|
||||
'0'..='9' => c as u8 - b'0',
|
||||
_ => unreachable!(),
|
||||
},
|
||||
Radix::Oct => match c {
|
||||
'0'..='7' => c as u8 - b'0',
|
||||
_ => unreachable!(),
|
||||
},
|
||||
}
|
||||
}
|
||||
pub fn folding_method(self) -> fn(u64, char) -> u64 {
|
||||
match self {
|
||||
Radix::Hex => {
|
||||
fn fold(acc: u64, c: char) -> u64 {
|
||||
let digit = match c {
|
||||
'0'..='9' => c as u8 - b'0',
|
||||
'a'..='f' => c as u8 - b'a',
|
||||
'A'..='F' => c as u8 - b'A',
|
||||
_ => unreachable!(),
|
||||
};
|
||||
acc * 16 + digit as u64
|
||||
}
|
||||
fold
|
||||
}
|
||||
Radix::Bin => {
|
||||
fn fold(acc: u64, c: char) -> u64 {
|
||||
let digit = match c {
|
||||
'0'..='1' => c as u8 - b'0',
|
||||
_ => unreachable!(),
|
||||
};
|
||||
acc * 2 + digit as u64
|
||||
}
|
||||
fold
|
||||
}
|
||||
Radix::Dec => {
|
||||
fn fold(acc: u64, c: char) -> u64 {
|
||||
let digit = match c {
|
||||
'0'..='9' => c as u8 - b'0',
|
||||
_ => unreachable!(),
|
||||
};
|
||||
acc * 10 + digit as u64
|
||||
}
|
||||
fold
|
||||
}
|
||||
Radix::Oct => {
|
||||
fn fold(acc: u64, c: char) -> u64 {
|
||||
let digit = match c {
|
||||
'0'..='7' => c as u8 - b'0',
|
||||
_ => unreachable!(),
|
||||
};
|
||||
acc * 8 + digit as u64
|
||||
}
|
||||
fold
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn is_digit(self) -> fn(char) -> bool {
|
||||
match self {
|
||||
Radix::Hex => crate::is_things::is_hex_digit,
|
||||
Radix::Bin => crate::is_things::is_bin_digit,
|
||||
Radix::Oct => crate::is_things::is_oct_digit,
|
||||
Radix::Dec => crate::is_things::is_digit,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// where DIGIT is defined by radix:
|
||||
// DIGITS <-
|
||||
// if allow_leading_underscore: `_`* DIGIT (DIGIT|`_`)*
|
||||
// else: DIGIT (DIGIT|`_`)*
|
||||
fn parse_digit_part(
|
||||
source: &mut Source,
|
||||
allow_leading_underscore: bool,
|
||||
radix: Radix,
|
||||
) -> Result<()> {
|
||||
let is_digit = radix.is_digit();
|
||||
|
||||
if allow_leading_underscore {
|
||||
let _underscore = source.take_while_ref(|&c| c == '_').count();
|
||||
}
|
||||
let _need_digit = source.next_if(|&c| is_digit(c)).ok_or_else(|| {
|
||||
if source.peek() == Some(&'_') {
|
||||
Error::NumericalConstantDigitLeadingUnderscore
|
||||
} else {
|
||||
Error::NumericalConstantDigitNoDigit
|
||||
}
|
||||
})?;
|
||||
let _rest = source.take_while_ref(|&c| is_digit(c) || c == '_').count();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// IntegralType <-
|
||||
// ( 'u' | 'i' ) DIGITS+
|
||||
fn try_parse_integral_type(source: &mut Source) -> Result<Option<()>> {
|
||||
if !source.next_if(|&c| c == 'u' || c == 'i').is_some() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if source
|
||||
.take_while_ref(|&c| crate::is_things::is_digit(c))
|
||||
.count()
|
||||
<= 0
|
||||
{
|
||||
return Err(Error::IntegralTypeExpectedDigit);
|
||||
};
|
||||
|
||||
Ok(Some(()))
|
||||
}
|
||||
|
||||
// returns `Err(E)` if it failed to parse.
|
||||
// returns `Ok(None)` if no exp part was found.
|
||||
// returns `Ok(Some(()))` if an exp part was found and parsed.
|
||||
//
|
||||
// EXP_PART <-
|
||||
// (`e`|`E`) (`-`|`+`)? DEC_DIGITS
|
||||
fn try_parse_exp_part(source: &mut Source) -> Result<Option<()>> {
|
||||
if source.next_if(|&c| c.to_ascii_lowercase() == 'e').is_some() {
|
||||
let _sign = source.next_if(|&c| c == '-' || c == '+');
|
||||
if source
|
||||
.take_while_ref(|&c| crate::is_things::is_digit(c))
|
||||
.count()
|
||||
.lt(&1)
|
||||
{
|
||||
// need digits following exp notation
|
||||
Err(Error::FloatingConstantExpPartNoDigit)
|
||||
} else {
|
||||
Ok(Some(()))
|
||||
}
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
// CONSTANT <-
|
||||
// DEC_DIGITS IntegralType?
|
||||
// `0x` HEX_DIGITS IntegralType?
|
||||
// `0b` BIN_DIGITS IntegralType?
|
||||
// `0o` OCT_DIGITS IntegralType?
|
||||
// DEC_DIGITS FloatingType?
|
||||
// `.` DEC_DIGITS EXP_PART? FloatingType?
|
||||
// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
|
||||
fn parse_constant_inner(source: &mut Source) -> Result<Token> {
|
||||
let zero = source.next_if(|&c| c == '0').is_some();
|
||||
|
||||
let radix = zero
|
||||
.then(|| source.next_if_map(|c| Radix::from_char(c)))
|
||||
.flatten();
|
||||
|
||||
if let Some(radix) = radix {
|
||||
parse_digit_part(source, false, radix)?;
|
||||
if source.peek().map(|&c| c == 'u' || c == 'i') == Some(true) {
|
||||
try_parse_integral_type(source)?;
|
||||
}
|
||||
return Ok(radix.to_token());
|
||||
}
|
||||
|
||||
// if zero: `_`* DIGIT (DIGIT|`_`)*
|
||||
// else: DIGIT (DIGIT|`_`)*
|
||||
_ = match parse_digit_part(source, zero, Radix::Dec) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(Error::NumericalConstantDigitNoDigit) if zero => Ok(()),
|
||||
Err(e) => Err(e),
|
||||
}?;
|
||||
|
||||
if let Some(_) = source.try_map_iter_if(|source| try_parse_integral_type(source))? {
|
||||
return Ok(Token::IntegerConstant);
|
||||
}
|
||||
|
||||
let dot = source.next_if(|&c| c == '.').is_some();
|
||||
|
||||
if dot {
|
||||
parse_digit_part(source, false, Radix::Dec)?;
|
||||
}
|
||||
|
||||
// parse exp notation
|
||||
let exp = try_parse_exp_part(source)?.is_some();
|
||||
|
||||
// trailing FloatingType?
|
||||
let floating = if source.next_if(|&c| c == 'f').is_some() {
|
||||
let digits = source.next_tuple::<(char, char)>();
|
||||
if !(digits == Some(('6', '4')) || digits == Some(('3', '2'))) {
|
||||
// need either f64 or f32 here!
|
||||
return Err(Error::FloatingConstantInvalidTrailingType);
|
||||
}
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let token = match (dot, exp, floating) {
|
||||
(false, false, false) => Token::IntegerConstant,
|
||||
(true, false, _) => Token::DotFloatingConstant,
|
||||
(true, true, _) => Token::DotFloatingExpConstant,
|
||||
(false, true, _) => Token::FloatingExpConstant,
|
||||
(false, _, _) => Token::FloatingConstant,
|
||||
};
|
||||
|
||||
Ok(token)
|
||||
}
|
||||
|
||||
pub(crate) fn parse_constant(source: &mut Source) -> Result<Token> {
|
||||
let constant = parse_constant_inner(source)?;
|
||||
// char following a constant must not be id_continue
|
||||
if source
|
||||
.peek()
|
||||
.map(|&c| crate::is_things::is_id_continue(c))
|
||||
.unwrap_or(false)
|
||||
{
|
||||
return Err(Error::NumericalConstantUnknownSuffix);
|
||||
}
|
||||
|
||||
Ok(constant)
|
||||
}
|
||||
|
||||
pub(crate) fn parse_string_or_char_constant(source: &mut Source) -> Result<Token> {
|
||||
let quote = source
|
||||
.next_if(|&c| c == '"' || c == '\'')
|
||||
.ok_or(Error::InvalidToken)?;
|
||||
|
||||
let is_char = quote == '\'';
|
||||
|
||||
let mut escaped = false;
|
||||
let mut closed = false;
|
||||
|
||||
while let Some(c) = source.next() {
|
||||
if escaped {
|
||||
// accept any escaped char
|
||||
escaped = false;
|
||||
continue;
|
||||
}
|
||||
if c == '\\' {
|
||||
escaped = true;
|
||||
continue;
|
||||
}
|
||||
if c == quote {
|
||||
closed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !closed {
|
||||
return Err(Error::StringError("Unterminated string/char.".into()));
|
||||
}
|
||||
|
||||
if is_char {
|
||||
Ok(Token::CharConstant)
|
||||
} else {
|
||||
Ok(Token::StringConstant)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps `s` in the peekable char iterator type the parsers consume.
    fn make_source(s: &'_ str) -> Source<'_> {
        s.chars().peekable().into()
    }

    /// Each input must lex completely into the token kind listed next to it.
    #[test]
    fn parse_constant_number() {
        let cases = [
            ("0x1A3F_u32", Token::IntegerHexConstant),
            ("13f32", Token::FloatingConstant),
            ("0b1011_0010i16", Token::IntegerBinConstant),
            ("0o755u8", Token::IntegerOctConstant),
            ("42i64", Token::IntegerConstant),
            ("3.14f64", Token::DotFloatingConstant),
            ("2.71828e0f32", Token::DotFloatingExpConstant),
            ("22e23", Token::FloatingExpConstant),
        ];

        for (input, expected) in cases {
            assert_eq!(
                parse_constant(&mut make_source(input)),
                Ok(expected),
                "input: {input:?}"
            );
        }
    }
}
|
|
@ -38,14 +38,14 @@ mod is_things {
|
|||
/// a formal definition of valid identifier name.
|
||||
pub fn is_id_start(c: char) -> bool {
|
||||
// This is XID_Start OR '_' OR '-' (neither of which is formally a XID_Start).
|
||||
c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
|
||||
c == '_' || c == '-' || unicode_xid::UnicodeXID::is_xid_start(c)
|
||||
}
|
||||
|
||||
/// True if `c` is valid as a non-first character of an identifier.
|
||||
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
|
||||
/// a formal definition of valid identifier name.
|
||||
pub fn is_id_continue(c: char) -> bool {
|
||||
unicode_xid::UnicodeXID::is_xid_continue(c)
|
||||
unicode_xid::UnicodeXID::is_xid_continue(c) || c == '-'
|
||||
}
|
||||
|
||||
/// The passed string is lexically an identifier.
|
||||
|
@ -58,22 +58,27 @@ mod is_things {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if `ch` is an ASCII decimal digit (`0` through `9`).
#[expect(dead_code)]
pub fn is_digit(ch: char) -> bool {
    ch.is_ascii_digit()
}
|
||||
|
||||
/// Returns `true` if `ch` is a binary digit (`0` or `1`).
#[expect(dead_code)]
pub fn is_bin_digit(ch: char) -> bool {
    matches!(ch, '0' | '1')
}
|
||||
|
||||
/// Returns `true` if `ch` is an ASCII decimal digit other than `0`.
#[expect(dead_code)]
pub fn is_nonzero_digit(ch: char) -> bool {
    matches!(ch, '1'..='9')
}
|
||||
|
||||
/// Returns `true` if `ch` is an octal digit (`0` through `7`).
#[expect(dead_code)]
pub fn is_oct_digit(ch: char) -> bool {
    matches!(ch, '0'..='7')
}
|
||||
|
||||
/// Returns `true` if `ch` is an ASCII hexadecimal digit (`0`-`9`, `a`-`f` or `A`-`F`).
#[expect(dead_code)]
pub fn is_hex_digit(ch: char) -> bool {
    ch.is_ascii_hexdigit()
}
|
||||
|
@ -119,6 +124,11 @@ macro_rules! tokens {
|
|||
self.lexeme().map(|lexeme|lexeme.chars().count()).unwrap_or(0)
|
||||
}
|
||||
|
||||
/// returns the length of this lexeme in UTF-8 bytes
|
||||
$vis fn lexeme_len_utf8(&self) -> usize {
|
||||
self.lexeme().map(|lexeme|lexeme.len()).unwrap_or(0)
|
||||
}
|
||||
|
||||
$vis fn maybe_ident(&self) -> bool {
|
||||
self.lexeme().map(|lexeme| crate::is_things::is_ident(lexeme)).unwrap_or(false)
|
||||
}
|
||||
|
@ -149,16 +159,15 @@ tokens!(pub Token: {
|
|||
DotFloatingConstant,
|
||||
DotFloatingExpConstant,
|
||||
StringConstant,
|
||||
IntegralType,
|
||||
Ident
|
||||
},
|
||||
// Lexical Tokens:
|
||||
{
|
||||
SlashSlash => "//",
|
||||
SlashStar => "/*",
|
||||
// SlashStarStar => "/**",
|
||||
StarSlash => "*/",
|
||||
// SlashSlashSlash => "///",
|
||||
SlashSlashSlash => "///",
|
||||
// SlashStar => "/*",
|
||||
// SlashStarStar => "/**",
|
||||
//StarSlash => "*/",
|
||||
// Punctuation:
|
||||
OpenParens => "(",
|
||||
CloseParens => ")",
|
||||
|
@ -445,15 +454,59 @@ impl LexemeParser {
|
|||
}
|
||||
}
|
||||
|
||||
use trie::{OnceAndIter, Tree};
|
||||
use itertools::Itertools;
|
||||
use trie::Tree;
|
||||
|
||||
pub struct LexemeIterator<I: Iterator<Item = char>> {
|
||||
trie: Tree<char, Token>,
|
||||
iter: OnceAndIter<I, char>,
|
||||
/// Iterator adapter that records how many UTF-8 bytes its yielded `char`s occupy.
///
/// Only items pulled through this adapter's own [`Iterator::next`] are counted. The
/// wrapped iterator is reachable through `Deref`/`DerefMut`; advancing it directly does
/// not update the count.
#[derive(Debug, Clone, Copy)]
struct CountingIterator<I: Iterator> {
    iter: I,
    count: usize,
}

impl<I: Iterator> From<I> for CountingIterator<I> {
    /// Wraps `iter` with the byte count starting at zero.
    fn from(iter: I) -> Self {
        Self { count: 0, iter }
    }
}

impl<I: Iterator<Item = char>> Iterator for CountingIterator<I> {
    type Item = I::Item;

    /// Yields the next char and adds its UTF-8 length to the running count.
    fn next(&mut self) -> Option<Self::Item> {
        let c = self.iter.next()?;
        self.count += c.len_utf8();
        Some(c)
    }
}

impl<I: Iterator> CountingIterator<I> {
    /// Number of UTF-8 bytes yielded through this adapter so far.
    pub(crate) fn offset(&self) -> usize {
        self.count
    }
}

impl<I: Iterator> core::ops::Deref for CountingIterator<I> {
    type Target = I;

    /// Gives shared access to the wrapped iterator.
    fn deref(&self) -> &Self::Target {
        &self.iter
    }
}

impl<I: Iterator> core::ops::DerefMut for CountingIterator<I> {
    /// Gives mutable access to the wrapped iterator (e.g. for peeking).
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.iter
    }
}
|
||||
|
||||
type Source<'a> = CountingIterator<core::iter::Peekable<core::str::Chars<'a>>>;
|
||||
|
||||
pub struct TokenIterator<'a> {
|
||||
trie: Tree<char, Token>,
|
||||
source: &'a str,
|
||||
offset: usize,
|
||||
}
|
||||
|
||||
impl<'a> TokenIterator<'a> {
|
||||
pub fn new(source: &'a str) -> Self {
|
||||
let mut trie = Tree::new();
|
||||
|
||||
for (token, token_str) in Token::lexemes() {
|
||||
|
@ -462,63 +515,172 @@ impl<I: Iterator<Item = char>> LexemeIterator<I> {
|
|||
|
||||
Self {
|
||||
trie,
|
||||
iter: iter.into(),
|
||||
source,
|
||||
offset: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn peekable_source(&self) -> Source<'a> {
|
||||
CountingIterator::from(self.source[self.offset..].chars().peekable())
|
||||
}
|
||||
|
||||
fn parse(&mut self) -> Option<Token> {
|
||||
match self.trie.get_closest(&mut self.iter) {
|
||||
Some((Some(key), token)) => {
|
||||
let mut iter = CountingIterator::from(self.source[self.offset..].chars());
|
||||
|
||||
match self.trie.get_closest(&mut iter) {
|
||||
Some(token) => {
|
||||
// skip the peeked item
|
||||
self.iter.set_once(key);
|
||||
self.offset += token.lexeme_len();
|
||||
Some(*token)
|
||||
}
|
||||
Some((None, token)) => Some(*token),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn skip_whitespaces(&mut self) {
|
||||
loop {
|
||||
let Some(c) = self.iter.next() else { break };
|
||||
fn skip_whitespaces(&mut self) -> usize {
|
||||
self.skip_while(is_things::is_whitespace)
|
||||
}
|
||||
|
||||
if is_things::is_whitespace(c) {
|
||||
fn skip(&mut self, mut n: usize) -> usize {
|
||||
self.skip_while(|_| {
|
||||
n -= 1;
|
||||
n > 0
|
||||
})
|
||||
}
|
||||
|
||||
fn skip_while(&mut self, mut pred: impl FnMut(char) -> bool) -> usize {
|
||||
let mut count = 0;
|
||||
loop {
|
||||
let Some(c) = self.source[self.offset..].chars().next() else {
|
||||
break;
|
||||
};
|
||||
|
||||
if pred(c) {
|
||||
self.offset += c.len_utf8();
|
||||
count += c.len_utf8();
|
||||
continue;
|
||||
} else {
|
||||
self.iter.set_once(c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
count
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: Iterator<Item = char>> Iterator for LexemeIterator<I> {
|
||||
type Item = Token;
|
||||
impl<'a> Iterator for TokenIterator<'a> {
|
||||
type Item = (Token, &'a str);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// skip whitespace
|
||||
self.skip_whitespaces();
|
||||
|
||||
self.parse()
|
||||
let start = self.offset;
|
||||
|
||||
let mut source = self.peekable_source();
|
||||
let mut cursor = self.peekable_source();
|
||||
let token = match cursor.next() {
|
||||
Some('0'..='9') => {
|
||||
let token = complex_tokens::parse_constant(&mut source).ok()?;
|
||||
self.offset += source.offset();
|
||||
|
||||
Some((token, &self.source[start..self.offset]))
|
||||
}
|
||||
Some('.') if cursor.next().map_or(false, is_things::is_digit) => {
|
||||
let token = complex_tokens::parse_constant(&mut source).ok()?;
|
||||
self.offset += source.offset();
|
||||
|
||||
Some((token, &self.source[start..self.offset]))
|
||||
}
|
||||
Some('\'' | '"') => {
|
||||
let token = complex_tokens::parse_string_or_char_constant(&mut source).ok()?;
|
||||
self.offset += source.offset();
|
||||
|
||||
Some((token, &self.source[start..self.offset]))
|
||||
}
|
||||
_ => match self.parse().map(|tok| match tok {
|
||||
Token::SlashSlash => {
|
||||
self.skip_while(|c| c == '\n');
|
||||
(Token::Comment)
|
||||
}
|
||||
Token::SlashSlashSlash => {
|
||||
self.skip_while(|c| c == '\n');
|
||||
(Token::DocComment)
|
||||
}
|
||||
_ => tok,
|
||||
}) {
|
||||
Some(tok) => {
|
||||
if tok.maybe_ident() && self.skip_while(|c| is_things::is_id_continue(c)) > 0 {
|
||||
Some((Token::Ident, &self.source[start..self.offset]))
|
||||
} else {
|
||||
Some((tok, &self.source[start..self.offset]))
|
||||
}
|
||||
}
|
||||
None => {
|
||||
if self
|
||||
.peekable_source()
|
||||
.next()
|
||||
.map_or(false, |c| is_things::is_id_start(c))
|
||||
{
|
||||
self.skip(1);
|
||||
self.skip_while(|c| is_things::is_id_continue(c));
|
||||
Some((Token::Ident, &self.source[start..self.offset]))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
token
|
||||
}
|
||||
}
|
||||
|
||||
mod complex_tokens;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_iterator() {
|
||||
let mut tokens = "fn let void+++(++bool)".chars();
|
||||
let mut lexer = LexemeIterator::new(&mut tokens);
|
||||
assert_eq!(lexer.next(), Some(Token::Fn));
|
||||
assert_eq!(lexer.next(), Some(Token::Let));
|
||||
assert_eq!(lexer.next(), Some(Token::Void));
|
||||
assert_eq!(lexer.next(), Some(Token::PlusPlus));
|
||||
assert_eq!(lexer.next(), Some(Token::Plus));
|
||||
assert_eq!(lexer.next(), Some(Token::OpenParens));
|
||||
assert_eq!(lexer.next(), Some(Token::PlusPlus));
|
||||
assert_eq!(lexer.next(), Some(Token::Bool));
|
||||
assert_eq!(lexer.next(), Some(Token::CloseParens));
|
||||
let tokens = "fn let void+++(++bool)";
|
||||
let mut lexer = TokenIterator::new(&tokens);
|
||||
assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
|
||||
assert_eq!(lexer.next(), Some((Token::Let, "let")));
|
||||
assert_eq!(lexer.next(), Some((Token::Void, "void")));
|
||||
assert_eq!(lexer.next(), Some((Token::PlusPlus, "++")));
|
||||
assert_eq!(lexer.next(), Some((Token::Plus, "+")));
|
||||
assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
|
||||
assert_eq!(lexer.next(), Some((Token::PlusPlus, "++")));
|
||||
assert_eq!(lexer.next(), Some((Token::Bool, "bool")));
|
||||
assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
|
||||
assert_eq!(lexer.next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn complex_iterator() {
|
||||
let tokens = "fn my-function(x: i32, y: f32) -> f32 { return x + y; }";
|
||||
let mut lexer = TokenIterator::new(&tokens);
|
||||
assert_eq!(lexer.next(), Some((Token::Fn, "fn")));
|
||||
assert_eq!(lexer.next(), Some((Token::Ident, "my-function")));
|
||||
assert_eq!(lexer.next(), Some((Token::OpenParens, "(")));
|
||||
assert_eq!(lexer.next(), Some((Token::Ident, "x")));
|
||||
assert_eq!(lexer.next(), Some((Token::Colon, ":")));
|
||||
assert_eq!(lexer.next(), Some((Token::Ident, "i32")));
|
||||
assert_eq!(lexer.next(), Some((Token::Comma, ",")));
|
||||
assert_eq!(lexer.next(), Some((Token::Ident, "y")));
|
||||
assert_eq!(lexer.next(), Some((Token::Colon, ":")));
|
||||
assert_eq!(lexer.next(), Some((Token::F32, "f32")));
|
||||
assert_eq!(lexer.next(), Some((Token::CloseParens, ")")));
|
||||
assert_eq!(lexer.next(), Some((Token::MinusGreater, "->")));
|
||||
assert_eq!(lexer.next(), Some((Token::F32, "f32")));
|
||||
assert_eq!(lexer.next(), Some((Token::OpenBrace, "{")));
|
||||
assert_eq!(lexer.next(), Some((Token::Return, "return")));
|
||||
assert_eq!(lexer.next(), Some((Token::Ident, "x")));
|
||||
assert_eq!(lexer.next(), Some((Token::Plus, "+")));
|
||||
assert_eq!(lexer.next(), Some((Token::Ident, "y")));
|
||||
assert_eq!(lexer.next(), Some((Token::Semi, ";")));
|
||||
assert_eq!(lexer.next(), Some((Token::CloseBrace, "}")));
|
||||
assert_eq!(lexer.next(), None);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue