use std::fmt::Display;
use crate::tokens::Token;
use crate::tokens::TokenPos;
use itertools::Itertools;
use crate::common::FallibleParse;
use crate::common::NextIf;

#[derive(Debug, thiserror::Error)]
pub enum Error {
    #[error("{0}")]
    StringError(String),
    #[error("Exp part of floating constant had no digits.")]
    FloatingConstantExpPartNoDigit,
    #[error("constant cannot start with leading underscore '_'.")]
    NumericalConstantDigitLeadingUnderscore,
    #[error("Expected digit here for constant.")]
    NumericalConstantDigitNoDigit,
    #[error("Expected digit here for integer constant.")]
    IntegralTypeExpectedDigit,
    #[error("Floating constant has invalid trailing type.")]
    FloatingConstantInvalidTrailingType,
    #[error("Invalid token.")]
    InvalidToken,
    #[error("Identifier starts with invalid character.")]
    ExpectedIdStartForIdentifier,
    #[error("Unknown suffix in constant.")]
    NumericalConstantUnknownSuffix,
}

pub type Result<T> = core::result::Result<T, Error>;

/// A cloneable cursor over the raw source bytes; cloning is cheap, which makes
/// lookahead (`peek`) trivial.
#[derive(Debug, Clone)]
pub struct Chars<'a> {
    bytes: &'a [u8],
    offset: usize,
}

impl<'a> Chars<'a> {
    pub fn as_str(&self) -> &str {
        let offset = self.offset.min(self.num_bytes());
        // SAFETY: `bytes` is assumed to hold valid UTF-8 and `offset` only
        // ever advances by whole characters (see `next_char`).
        unsafe { core::str::from_utf8_unchecked(&self.bytes[offset..]) }
    }

    pub fn seek(&mut self, offset: u32) {
        self.offset = offset as usize;
    }

    pub fn num_bytes(&self) -> usize {
        self.bytes.len()
    }

    pub fn is_eof(&self) -> bool {
        self.offset >= self.bytes.len()
    }

    pub fn peek(&self) -> Option<char> {
        self.clone().next()
    }

    pub fn position(&self) -> u32 {
        self.offset() as u32
    }

    pub fn offset(&self) -> usize {
        self.offset
    }

    /// Translates the byte range `start..end` into 1-based line / 0-based
    /// column locations by walking the source up to `end`.
    pub fn get_source_span(&self, start: u32, end: u32) -> std::ops::Range<SourceLocation> {
        let (start_l, start_c) = {
            let range = self.get_from_to(0, start);
            range.chars().fold((1u32, 0u32), |(line, col), c| {
                if c == '\n' {
                    (line + 1, 0)
                } else {
                    (line, col + 1)
                }
            })
        };
        let (end_l, end_c) = {
            let range = self.get_from_to(start, end);
            range.chars().fold((start_l, start_c), |(line, col), c| {
                if c == '\n' {
                    (line + 1, 0)
                } else {
                    (line, col + 1)
                }
            })
        };
        core::ops::Range {
            start: SourceLocation::new(start_l, start_c),
            end: SourceLocation::new(end_l, end_c),
        }
    }

    /// Returns the full source lines that contain the byte range `start..end`.
    pub fn get_lines(&self, start: u32, end: u32) -> &str {
        let range = self.get_from_to(0, start);
        let start = range
            .char_indices()
            .rev()
            .find(|&(_, c)| c == '\n')
            .map(|(idx, c)| idx + c.len_utf8())
            .unwrap_or(0);
        let range = self.get_from_to(end, self.num_bytes() as u32);
        let end = range
            .char_indices()
            .find(|&(_, c)| c == '\n')
            .map(|(idx, _)| idx as u32 + end)
            .unwrap_or(self.num_bytes() as u32);
        self.get_from_to(start as u32, end as u32)
    }

    pub fn get_range(&self, range: core::ops::Range<u32>) -> &str {
        unsafe {
            core::str::from_utf8_unchecked(&self.bytes[range.start as usize..range.end as usize])
        }
    }

    pub fn get_from_to(&self, start: u32, end: u32) -> &str {
        unsafe { core::str::from_utf8_unchecked(&self.bytes[start as usize..end as usize]) }
    }

    fn next_char(&mut self) -> Option<char> {
        let ch = self.as_str().chars().next()?;
        self.offset += ch.len_utf8();
        Some(ch)
    }
}

impl<'a> Iterator for Chars<'a> {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_char()
    }
}

#[derive(Debug, Clone)]
pub struct Tokenizer<'a> {
    source: Chars<'a>,
    tokens: Vec<TokenPos>,
}

/// Cursor over a `Tokenizer`'s token list; comment tokens are skipped while
/// iterating.
#[derive(Debug, Clone)]
pub struct TokenIterator<'a> {
    tokenizer: &'a Tokenizer<'a>,
    offset: usize,
}

impl<'a> TokenIterator<'a> {
    pub fn expect_token(&mut self, token: Token) -> crate::parser::Result<TokenItem<'a>> {
        self.next_if(|item| item.token() == token)
            .ok_or(crate::parser::Error::ExpectedTokenNotFound(token))
    }

    pub fn eat_token(&mut self, token: Token) -> Option<TokenItem<'a>> {
        self.next_if(|item| item.token() == token)
    }
    pub fn peek_token(&mut self) -> Option<TokenItem<'a>> {
        self.clone().next()
    }

    pub fn peek_token_or_err(&mut self) -> crate::parser::Result<TokenItem<'a>> {
        self.clone()
            .next()
            .ok_or(crate::parser::Error::UnexpectedEndOfTokens)
    }

    pub fn peek_expect_token(&mut self, token: Token) -> crate::parser::Result<TokenItem<'a>> {
        self.clone()
            .next()
            .ok_or(crate::parser::Error::ExpectedTokenNotFound(token))
    }

    pub fn is_next_token(&mut self, token: Token) -> bool {
        self.clone().next_if(|item| item.token() == token).is_some()
    }

    pub fn is_next_token2(&mut self, token: Token) -> bool {
        self.clone()
            .skip(1)
            .next_if(|item| item.token() == token)
            .is_some()
    }
}

#[derive(Debug)]
pub struct TokenItem<'a> {
    tokenizer: &'a Tokenizer<'a>,
    inner: TokenPos,
}

#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct SourceLocation {
    pub line: u32,
    pub column: u32,
}

impl Display for SourceLocation {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "l:{},c:{}", self.line, self.column)
    }
}

impl SourceLocation {
    pub fn new(line: u32, column: u32) -> Self {
        Self { line, column }
    }

    /// Prints each line of `lines` followed by a red `^~~~` underline covering
    /// the span `this`.
    pub fn squiggle_line(this: core::ops::Range<SourceLocation>, lines: &str) {
        let lines = lines.lines();
        let squiggle_lines = this.end.line - this.start.line;
        for (i, line) in lines.enumerate() {
            println!("{line}");
            let squiggle_range = {
                let start = if i == 0 { this.start.column } else { 0 };
                let end = if i as u32 + 1 == squiggle_lines {
                    this.end.column
                } else {
                    line.len() as u32
                };
                start..end
            };
            if !squiggle_range.is_empty() {
                for _ in 0..squiggle_range.start {
                    print!(" ");
                }
                print!("{}", ansi_term::Colour::Red.paint("^"));
                for _ in squiggle_range.start..(squiggle_range.end - 1) {
                    print!("{}", ansi_term::Colour::Red.paint("~"));
                }
                println!();
            }
        }
    }
}

impl<'a> TokenItem<'a> {
    pub fn token(&self) -> Token {
        self.inner.token
    }

    pub fn lexeme(&self) -> &str {
        self.tokenizer
            .source
            .get_from_to(self.inner.start, self.inner.end)
    }

    pub fn source_location(&self) -> std::ops::Range<SourceLocation> {
        self.tokenizer
            .source
            .get_source_span(self.inner.start, self.inner.end)
    }
}

impl<'a> Iterator for TokenIterator<'a> {
    type Item = TokenItem<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.offset >= self.tokenizer.tokens.len() {
            None
        } else {
            let index = self.offset;
            self.offset += 1;
            match self.tokenizer.tokens[index].token {
                // comments are tokenized but never handed to the parser
                Token::SlashSlash
                | Token::SlashSlashSlash
                | Token::SlashStar
                | Token::SlashStarStar
                | Token::Comment
                | Token::DocComment => self.next(),
                _ => Some(Self::Item {
                    tokenizer: self.tokenizer,
                    inner: self.tokenizer.tokens[index],
                }),
            }
        }
    }
}

macro_rules! next_or_eof {
    ($expr:expr) => {
        match $expr.next() {
            Some(c) => c,
            None => {
                return Ok(Token::Eof);
            }
        }
    };
    (?$expr:expr) => {
        match $expr.peek() {
            Some(c) => c,
            None => {
                return Ok(Token::Eof);
            }
        }
    };
}
residual { (ok: $expr:expr) => { match $expr { Ok(t) => t, Err(e) => { return Err(e); } } }; (none: $expr:expr) => { match $expr { Ok(Some(t)) => { return Ok(Some(t)); } Ok(val) => val, Err(e) => { return Err(e); } } }; (flatten: none: $expr:expr) => { match $expr { Ok(Some(t)) => { return Ok(t); } Ok(val) => val, Err(e) => { return Err(e); } } }; (some: $expr:expr) => { match $expr { Ok(Some(t)) => t, Ok(None) => { return Ok(None); } Err(e) => { return Err(e); } } }; } pub struct TokenizeError { pub err: Error, pub range: core::ops::Range, } impl<'a> Tokenizer<'a> { pub fn iter(&self) -> TokenIterator { TokenIterator { tokenizer: self, offset: 0, } } pub fn src(&self) -> &Chars<'a> { &self.source } pub fn new_with_errors( bytes: &'a [u8], ) -> core::result::Result)> { let mut this = Self { source: Chars { bytes, offset: 0 }, tokens: Vec::new(), }; let mut errors = Vec::new(); loop { if this.source.is_eof() { break; } let start = this.source.position(); match this.next_token() { Ok(_) => {} Err(e) => { // let is_quoted = this // .source // .get_range(start, this.source.bytes.len() as u32) // .chars() // .take_while_ref(|&c| crate::common::is_whitespace(c)) // .next() // .map(|c| c == '\'' || c == '"') // .unwrap_or(false); let end = this.source.position(); if this.source.peek().map(|c| crate::common::is_whitespace(c)) != Some(true) { this.source .take_while_ref(|&c| !crate::common::is_whitespace(c)) .count(); } _ = this.push_token(Token::ParseError, start, end); errors.push(TokenizeError { err: e, range: start..end, }); } } } if errors.is_empty() { Ok(this) } else { Err((this, errors)) } } pub fn new(bytes: &'a [u8]) -> Result> { let mut this = Self { source: Chars { bytes, offset: 0 }, tokens: Vec::new(), }; loop { if this.source.is_eof() { break; } this.next_token().map_err(|e| { eprintln!("error while tokenizing: {e}"); eprintln!( "at position {}: {}", this.source.offset(), &this.source.as_str()[..this.source.as_str().len().min(16)] ); e })?; } Ok(this) } fn push_token(&mut self, token: Token, start: u32, end: u32) -> Result<()> { self.tokens.push(TokenPos::new(token, start, end)); Ok(()) } fn next_token(&mut self) -> Result<()> { self.source .take_while_ref(|&c| crate::common::is_whitespace(c)) .count(); if self.source.is_eof() { return Ok(()); } let start = self.source.position(); let token = { let mut peeking = self.source.clone(); match peeking.next() { Some('0'..='9') => Some(parse_constant(&mut self.source)?), Some('.') if peeking.next().map(|c| crate::common::is_digit(c)) == Some(true) => { Some(parse_constant(&mut self.source)?) 
                }
                _ => None,
            }
        };
        if let Some(token) = token {
            return self.push_token(token, start, self.source.position());
        }

        // lexical tokens
        let token = crate::tokens::LexemeParser::parse(self.source.clone());
        if let Some(token) = token {
            _ = self.source.advance_by(token.lexeme_len());
            match token {
                Token::SlashSlash | Token::SlashSlashSlash => {
                    _ = self.push_token(token, start, self.source.position());
                    let start = self.source.position();
                    loop {
                        // advance until either EOF or newline
                        let Some(ch) = self.source.next() else {
                            break;
                        };
                        if ch == '\n' {
                            break;
                        }
                    }
                    let end = self.source.position() - 1;
                    return self.push_token(
                        if token == Token::SlashSlash {
                            Token::Comment
                        } else {
                            Token::DocComment
                        },
                        start,
                        end,
                    );
                }
                Token::SlashStar | Token::SlashStarStar => {
                    let start = self.source.position();
                    let mut end = self.source.position();
                    let mut last = self.source.next();
                    loop {
                        // break out of loop if EOF
                        let Some(l) = last.replace(match self.source.next() {
                            Some(ch) => ch,
                            None => {
                                break;
                            }
                        }) else {
                            break;
                        };
                        // break out of loop if end of comment
                        if (l, last.unwrap()) == ('*', '/') {
                            break;
                        }
                        end = self.source.position() - 1;
                    }
                    return self.push_token(
                        if token == Token::SlashStar {
                            Token::Comment
                        } else {
                            Token::DocComment
                        },
                        start,
                        end,
                    );
                }
                _ => {}
            }
            if token.maybe_ident() {
                // a keyword-like lexeme followed by more identifier characters
                // is actually an identifier
                if self
                    .source
                    .take_while_ref(|&c| crate::common::is_id_continue(c))
                    .count()
                    .gt(&0)
                {
                    return self.push_token(Token::Ident, start, self.source.position());
                }
            }
            return self.push_token(token, start, self.source.position());
        }

        // identifiers
        self.source
            .next_if(|&c| crate::common::is_id_start(c))
            .ok_or(Error::ExpectedIdStartForIdentifier)?;
        self.source
            .take_while_ref(|&c| crate::common::is_id_continue(c))
            .count();
        return self.push_token(Token::Ident, start, self.source.position());
    }
}

/// IntegralType <-
///     ( 'u' | 'i' ) DIGITS+
fn try_parse_integral_type(source: &mut Chars) -> Result<Option<()>> {
    if source.next_if(|&c| c == 'u' || c == 'i').is_none() {
        return Ok(None);
    }
    if source
        .take_while_ref(|&c| crate::common::is_digit(c))
        .count()
        == 0
    {
        return Err(Error::IntegralTypeExpectedDigit);
    }
    Ok(Some(()))
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Radix {
    Hex,
    Bin,
    Dec,
    Oct,
}

impl Radix {
    #[allow(unused)]
    /// must be called with one of `['b','x','d','o']`
    unsafe fn from_char_unchecked(c: char) -> Self {
        match c.to_ascii_lowercase() {
            'o' => Self::Oct,
            'b' => Self::Bin,
            'x' => Self::Hex,
            'd' => Self::Dec,
            _ => unreachable!(),
        }
    }

    fn from_char(c: char) -> Option<Self> {
        match c.to_ascii_lowercase() {
            'o' => Some(Self::Oct),
            'b' => Some(Self::Bin),
            'x' => Some(Self::Hex),
            'd' => Some(Self::Dec),
            _ => None,
        }
    }

    #[allow(unused)]
    fn radix(self) -> u8 {
        match self {
            Radix::Hex => 16,
            Radix::Bin => 2,
            Radix::Oct => 8,
            Radix::Dec => 10,
        }
    }

    fn to_token(self) -> Token {
        match self {
            Radix::Hex => Token::IntegerHexConstant,
            Radix::Bin => Token::IntegerBinConstant,
            Radix::Oct => Token::IntegerOctConstant,
            Radix::Dec => Token::IntegerConstant,
        }
    }

    pub fn from_token(token: Token) -> Option<Radix> {
        match token {
            Token::IntegerHexConstant => Some(Radix::Hex),
            Token::IntegerBinConstant => Some(Radix::Bin),
            Token::IntegerOctConstant => Some(Radix::Oct),
            Token::IntegerConstant => Some(Radix::Dec),
            _ => None,
        }
    }

    /// Returns a fold function that accumulates digit characters into a `u64`
    /// for this radix (worked examples: see the sketch tests at the end of
    /// this file).
    pub fn folding_method(self) -> fn(u64, char) -> u64 {
        match self {
            Radix::Hex => {
                fn fold(acc: u64, c: char) -> u64 {
                    let digit = match c {
                        '0'..='9' => c as u8 - b'0',
                        // hex letters carry the values 10..=15
                        'a'..='f' => c as u8 - b'a' + 10,
                        'A'..='F' => c as u8 - b'A' + 10,
                        _ => unreachable!(),
                    };
                    acc * 16 + digit as u64
                }
                fold
            }
            Radix::Bin => {
                fn fold(acc: u64, c: char) -> u64 {
                    let digit = match c {
                        '0'..='1' => c as u8 - b'0',
                        _ => unreachable!(),
                    };
                    acc * 2 + digit as u64
                }
                fold
            }
            Radix::Dec => {
                fn fold(acc: u64, c: char) -> u64 {
                    let digit = match c {
                        '0'..='9' => c as u8 - b'0',
                        _ => unreachable!(),
                    };
                    acc * 10 + digit as u64
                }
                fold
            }
            Radix::Oct => {
                fn fold(acc: u64, c: char) -> u64 {
                    let digit = match c {
                        '0'..='7' => c as u8 - b'0',
                        _ => unreachable!(),
                    };
                    acc * 8 + digit as u64
                }
                fold
            }
        }
    }

    pub fn is_digit(self) -> fn(char) -> bool {
        match self {
            Radix::Hex => crate::common::is_hex_digit,
            Radix::Bin => crate::common::is_bin_digit,
            Radix::Oct => crate::common::is_oct_digit,
            Radix::Dec => crate::common::is_digit,
        }
    }
}

/// where DIGIT is defined by radix:
///
/// DIGITS <-
///     if allow_leading_underscore: `_`* DIGIT (DIGIT|`_`)*
///     else: DIGIT (DIGIT|`_`)*
fn parse_digit_part(
    source: &mut Chars,
    allow_leading_underscore: bool,
    radix: Radix,
) -> Result<()> {
    let radix = radix.is_digit();
    if allow_leading_underscore {
        let _underscore = source.take_while_ref(|&c| c == '_').count();
    }
    let _need_digit = source.next_if(|&c| radix(c)).ok_or_else(|| {
        if source.peek() == Some('_') {
            Error::NumericalConstantDigitLeadingUnderscore
        } else {
            Error::NumericalConstantDigitNoDigit
        }
    })?;
    let _rest = source.take_while_ref(|&c| radix(c) || c == '_').count();
    Ok(())
}

/// returns `Err(E)` if it failed to parse.
/// returns `Ok(None)` if no exp part was found.
/// returns `Ok(Some(()))` if an exp part was found and parsed.
///
/// EXP_PART <-
///     (`e`|`E`) (`-`|`+`)? DEC_DIGITS
fn try_parse_exp_part(source: &mut Chars) -> Result<Option<()>> {
    if source.next_if(|&c| c.to_ascii_lowercase() == 'e').is_some() {
        let _sign = source.next_if(|&c| c == '-' || c == '+');
        if source
            .take_while_ref(|&c| crate::common::is_digit(c))
            .count()
            .lt(&1)
        {
            // need digits following exp notation
            Err(Error::FloatingConstantExpPartNoDigit)
        } else {
            Ok(Some(()))
        }
    } else {
        Ok(None)
    }
}

/// CONSTANT <-
///     DEC_DIGITS IntegralType?
///     `0x` HEX_DIGITS IntegralType?
///     `0b` BIN_DIGITS IntegralType?
///     `0o` OCT_DIGITS IntegralType?
///     DEC_DIGITS FloatingType?
///     `.` DEC_DIGITS EXP_PART? FloatingType?
///     DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
fn parse_constant_inner(source: &mut Chars) -> Result<Token> {
    let zero = source.next_if(|&c| c == '0').is_some();
    let radix = zero
        .then(|| source.next_if_map(|c| Radix::from_char(c)))
        .flatten();

    // radix-prefixed integer constants: `0x..`, `0b..`, `0o..`, `0d..`
    if let Some(radix) = radix {
        parse_digit_part(source, false, radix)?;
        if source.peek().map(|c| c == 'u' || c == 'i') == Some(true) {
            try_parse_integral_type(source)?;
        }
        return Ok(radix.to_token());
    }

    // if zero: `_`* DIGIT (DIGIT|`_`)*
    // else: DIGIT (DIGIT|`_`)*
    _ = match parse_digit_part(source, zero, Radix::Dec) {
        Ok(_) => Ok(()),
        Err(Error::NumericalConstantDigitNoDigit) if zero => Ok(()),
        Err(e) => Err(e),
    }?;

    if let Ok(_) = source.try_parse_result(|source| try_parse_integral_type(source)) {
        return Ok(Token::IntegerConstant);
    }

    let dot = source.next_if(|&c| c == '.').is_some();
    if dot {
        parse_digit_part(source, false, Radix::Dec)?;
    }

    // parse exp notation
    let exp = try_parse_exp_part(source)?.is_some();

    // trailing FloatingType?
    let floating = if source.next_if(|&c| c == 'f').is_some() {
        let digits = source.next_tuple::<(char, char)>();
        if !(digits == Some(('6', '4')) || digits == Some(('3', '2'))) {
            // need either f64 or f32 here!
            return Err(Error::FloatingConstantInvalidTrailingType);
        }
        true
    } else {
        false
    };

    let token = match (dot, exp, floating) {
        (false, false, false) => Token::IntegerConstant,
        (true, false, _) => Token::DotFloatingConstant,
        (true, true, _) => Token::DotFloatingExpConstant,
        (false, true, _) => Token::FloatingExpConstant,
        (false, _, _) => Token::FloatingConstant,
    };

    Ok(token)
}

/// CONSTANT <-
///     DEC_DIGITS IntegralType?
///     `0x` HEX_DIGITS IntegralType?
///     `0b` BIN_DIGITS IntegralType?
///     `0o` OCT_DIGITS IntegralType?
///     DEC_DIGITS FloatingType?
///     `.` DEC_DIGITS EXP_PART? FloatingType?
///     DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
///
/// (see the sketch tests at the end of this file for a few worked examples)
fn parse_constant(source: &mut Chars) -> Result<Token> {
    let constant = parse_constant_inner(source)?;
    // the char following a constant must not be id_continue; a constant may
    // also simply end the input
    if source.peek().map(|c| crate::common::is_id_continue(c)) == Some(true) {
        return Err(Error::NumericalConstantUnknownSuffix);
    }
    Ok(constant)
}
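
// A small sketch of the `CONSTANT` grammar above, driven through
// `parse_constant` on radix-prefixed literals. It assumes only that the digit
// predicates in `crate::common` (`is_hex_digit`, `is_bin_digit`,
// `is_oct_digit`, `is_id_continue`) accept the usual ASCII digits and reject
// `_` and whitespace; the trailing space keeps each input clearly terminated.
#[cfg(test)]
mod constant_sketch_tests {
    use super::*;

    fn parse(src: &str) -> Result<Token> {
        let mut chars = Chars {
            bytes: src.as_bytes(),
            offset: 0,
        };
        parse_constant(&mut chars)
    }

    #[test]
    fn radix_prefixed_integer_constants() {
        assert!(matches!(parse("0x1f "), Ok(Token::IntegerHexConstant)));
        assert!(matches!(parse("0b1010 "), Ok(Token::IntegerBinConstant)));
        assert!(matches!(parse("0o755 "), Ok(Token::IntegerOctConstant)));
    }

    #[test]
    fn leading_underscore_is_rejected() {
        assert!(matches!(
            parse("0x_1 "),
            Err(Error::NumericalConstantDigitLeadingUnderscore)
        ));
    }
}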
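
// A minimal sketch of the digit-folding arithmetic in `Radix::folding_method`
// (`acc * radix + digit`) and of the `from_char` / `to_token` / `from_token`
// round trip. Everything exercised here is defined in this file.
#[cfg(test)]
mod radix_sketch_tests {
    use super::*;

    #[test]
    fn fold_digits_per_radix() {
        // "ff" in base 16 is 255, "1010" in base 2 is 10, "755" in base 8 is 493
        assert_eq!("ff".chars().fold(0, Radix::Hex.folding_method()), 255);
        assert_eq!("1010".chars().fold(0, Radix::Bin.folding_method()), 10);
        assert_eq!("755".chars().fold(0, Radix::Oct.folding_method()), 493);
        assert_eq!("1234".chars().fold(0, Radix::Dec.folding_method()), 1234);
    }

    #[test]
    fn radix_char_and_token_round_trip() {
        assert_eq!(Radix::from_char('x'), Some(Radix::Hex));
        assert_eq!(Radix::from_char('B'), Some(Radix::Bin));
        assert_eq!(Radix::from_char('z'), None);
        for radix in [Radix::Hex, Radix::Bin, Radix::Oct, Radix::Dec] {
            assert_eq!(Radix::from_token(radix.to_token()), Some(radix));
        }
    }
}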
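
// Sketch of the line/column arithmetic in `Chars::get_source_span`: lines are
// 1-based, columns are 0-based, and a span's end location points one past its
// last character. Only code from this file is exercised.
#[cfg(test)]
mod span_sketch_tests {
    use super::*;

    #[test]
    fn span_tracks_lines_and_columns() {
        // in "ab\ncd", 'c' is at byte offset 3 and 'd' at offset 4
        let chars = Chars {
            bytes: &b"ab\ncd"[..],
            offset: 0,
        };
        let span = chars.get_source_span(3, 5);
        assert_eq!(span.start, SourceLocation::new(2, 0));
        assert_eq!(span.end, SourceLocation::new(2, 2));
    }
}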