From 02be9bdc2630ef9b986ad9390b514ad9d9820a79 Mon Sep 17 00:00:00 2001 From: Janis Date: Fri, 9 Aug 2024 18:58:26 +0200 Subject: [PATCH] simple lexing error reporting --- Cargo.toml | 2 + src/bin/tokenizer.rs | 54 +++++++++ src/lexer.rs | 217 ++++++++++++++++++++++++++++------ src/lib.rs | 16 ++- src/parser.rs | 77 ++++++++++-- src/tokens.rs | 1 + tests/faulty/non_id_start.sea | 4 + 7 files changed, 323 insertions(+), 48 deletions(-) create mode 100644 src/bin/tokenizer.rs create mode 100644 tests/faulty/non_id_start.sea diff --git a/Cargo.toml b/Cargo.toml index 1fb4d26..39741c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,8 @@ version = "0.1.0" edition = "2021" [dependencies] +ansi_term = "0.12.1" +clap = "4.5.14" itertools = "0.13.0" log = "0.4.22" thiserror = "1.0.63" diff --git a/src/bin/tokenizer.rs b/src/bin/tokenizer.rs new file mode 100644 index 0000000..a43b028 --- /dev/null +++ b/src/bin/tokenizer.rs @@ -0,0 +1,54 @@ +use std::{io::Read, path::PathBuf}; + +use compiler::*; +use lexer::SourceLocation; + +fn main() { + let cmd = clap::Command::new("sea-tokens").bin_name("sea-tokens").arg( + clap::Arg::new("input") + .short('i') + .help("sea source file.") + .value_parser(clap::builder::PathBufValueParser::new()), + ); + + let matches = cmd.get_matches(); + let path = matches.get_one::("input"); + let source = path + .and_then(|p| std::fs::read(p).ok()) + .or_else(|| { + let mut buf = Vec::new(); + std::io::stdin().read(&mut buf).ok()?; + Some(buf) + }) + .expect("no source bytes."); + + let tokens = tokenize(&source); + match tokens { + Ok(tokens) => { + for tok in tokens.iter() { + println!("{}@[{}]", tok.token(), tok.source_location().start); + } + } + Err((tokens, errors)) => { + eprint!("{} errors while tokenizing", errors.len()); + match path { + Some(path) => { + eprint!("{}", path.display()); + } + None => { + eprint!("stdin"); + } + } + eprintln!(":"); + for error in &errors { + let lines = tokens.src().get_lines(error.range.start, error.range.end); + let location = tokens + .src() + .get_source_span(error.range.start, error.range.end); + + eprintln!("Error: {}", error.err); + SourceLocation::squiggle_line(location, lines); + } + } + } +} diff --git a/src/lexer.rs b/src/lexer.rs index 46965b3..8ee47e9 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -1,3 +1,5 @@ +use std::fmt::Display; + use crate::tokens::Token; use crate::tokens::TokenPos; use itertools::Itertools; @@ -11,18 +13,20 @@ pub enum Error { StringError(String), #[error("Exp part of floating constant had no digits.")] FloatingConstantExpPartNoDigit, - #[error("Dummy Message.")] + #[error("constant cannot start with leading underscore '_'.")] NumericalConstantDigitLeadingUnderscore, - #[error("Dummy Message.")] + #[error("Expected digit here for constant.")] NumericalConstantDigitNoDigit, - #[error("Dummy Message.")] + #[error("Expected digit here for integer constant.")] IntegralTypeExpectedDigit, - #[error("Dummy Message.")] + #[error("Floating constant has invalid trailing type.")] FloatingConstantInvalidTrailingType, - #[error("Dummy Message.")] + #[error("Invalid token.")] InvalidToken, - #[error("Dummy Message.")] + #[error("Identifier starts with invalid character.")] ExpectedIdStartForIdentifier, + #[error("Unknown suffix in constant.")] + NumericalConstantUnknownSuffix, } pub type Result = core::result::Result; @@ -35,11 +39,22 @@ pub struct Chars<'a> { impl<'a> Chars<'a> { pub fn as_str(&self) -> &str { - unsafe { core::str::from_utf8_unchecked(&self.bytes[self.offset..]) } + let offset = self.offset.min(self.num_bytes()); + unsafe { core::str::from_utf8_unchecked(&self.bytes[offset..]) } } + + pub fn seek(&mut self, offset: u32) { + self.offset = offset as usize; + } + + pub fn num_bytes(&self) -> usize { + self.bytes.len() + } + pub fn is_eof(&self) -> bool { self.offset >= self.bytes.len() } + pub fn peek(&self) -> Option { self.clone().next() } @@ -52,13 +67,9 @@ impl<'a> Chars<'a> { self.offset } - pub fn get_source_span( - &self, - start: u32, - end: u32, - ) -> std::ops::RangeInclusive { + pub fn get_source_span(&self, start: u32, end: u32) -> std::ops::Range { let (start_l, start_c) = { - let range = self.get_range(0, start); + let range = self.get_from_to(0, start); range.chars().fold((1u32, 0u32), |(line, col), c| { if c == '\n' { (line + 1, 0) @@ -68,7 +79,7 @@ impl<'a> Chars<'a> { }) }; let (end_l, end_c) = { - let range = self.get_range(start, end); + let range = self.get_from_to(start, end); range.chars().fold((start_l, start_c), |(line, col), c| { if c == '\n' { (line + 1, 0) @@ -78,13 +89,40 @@ impl<'a> Chars<'a> { }) }; - core::ops::RangeInclusive::new( - SourceLocation::new(start_l, start_c), - SourceLocation::new(end_l, end_c), - ) + core::ops::Range { + start: SourceLocation::new(start_l, start_c), + end: SourceLocation::new(end_l, end_c), + } } - pub fn get_range(&self, start: u32, end: u32) -> &str { + pub fn get_lines(&self, start: u32, end: u32) -> &str { + let range = self.get_from_to(0, start); + let start = range + .char_indices() + .rev() + .skip_while(|&(_, c)| c != '\n') + .next() + .map(|(idx, c)| idx + c.len_utf8()) + .unwrap_or(0); + + let range = self.get_from_to(end, self.num_bytes() as u32); + let end = range + .char_indices() + .skip_while(|&(_, c)| c != '\n') + .next() + .map(|(idx, _)| idx as u32 + end) + .unwrap_or(self.num_bytes() as u32); + + self.get_from_to(start as u32, end as u32) + } + + pub fn get_range(&self, range: core::ops::Range) -> &str { + unsafe { + core::str::from_utf8_unchecked(&self.bytes[range.start as usize..range.end as usize]) + } + } + + pub fn get_from_to(&self, start: u32, end: u32) -> &str { unsafe { core::str::from_utf8_unchecked(&self.bytes[start as usize..end as usize]) } } @@ -163,10 +201,45 @@ pub struct SourceLocation { pub column: u32, } +impl Display for SourceLocation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "l:{},c:{}", self.line, self.column) + } +} + impl SourceLocation { pub fn new(line: u32, column: u32) -> Self { Self { line, column } } + + pub fn squiggle_line(this: core::ops::Range, lines: &str) { + let lines = lines.lines(); + let squiggle_lines = this.end.line - this.start.line; + + for (i, line) in lines.enumerate() { + println!("{line}"); + let squiggle_range = { + let start = if i == 0 { this.start.column } else { 0 }; + let end = if i as u32 + 1 == squiggle_lines { + this.end.column + } else { + line.len() as u32 + }; + start..end + }; + + if !squiggle_range.is_empty() { + for _ in 0..squiggle_range.start { + print!(" "); + } + print!("{}", ansi_term::Colour::Red.paint("^")); + for _ in squiggle_range.start..(squiggle_range.end - 1) { + print!("{}", ansi_term::Colour::Red.paint("~")); + } + println!(); + } + } + } } impl<'a> TokenItem<'a> { @@ -177,10 +250,10 @@ impl<'a> TokenItem<'a> { pub fn lexeme(&self) -> &str { self.tokenizer .source - .get_range(self.inner.start, self.inner.end) + .get_from_to(self.inner.start, self.inner.end) } - pub fn source_location(&self) -> std::ops::RangeInclusive { + pub fn source_location(&self) -> std::ops::Range { self.tokenizer .source .get_source_span(self.inner.start, self.inner.end) @@ -275,6 +348,11 @@ macro_rules! residual { }; } +pub struct TokenizeError { + pub err: Error, + pub range: core::ops::Range, +} + impl<'a> Tokenizer<'a> { pub fn iter(&self) -> TokenIterator { TokenIterator { @@ -283,6 +361,61 @@ impl<'a> Tokenizer<'a> { } } + pub fn src(&self) -> &Chars<'a> { + &self.source + } + + pub fn new_with_errors( + bytes: &'a [u8], + ) -> core::result::Result)> { + let mut this = Self { + source: Chars { bytes, offset: 0 }, + tokens: Vec::new(), + }; + let mut errors = Vec::new(); + + loop { + if this.source.is_eof() { + break; + } + + let start = this.source.position(); + + match this.next_token() { + Ok(_) => {} + Err(e) => { + // let is_quoted = this + // .source + // .get_range(start, this.source.bytes.len() as u32) + // .chars() + // .take_while_ref(|&c| crate::common::is_whitespace(c)) + // .next() + // .map(|c| c == '\'' || c == '"') + // .unwrap_or(false); + let end = this.source.position(); + + if this.source.peek().map(|c| crate::common::is_whitespace(c)) != Some(true) { + this.source + .take_while_ref(|&c| !crate::common::is_whitespace(c)) + .count(); + } + + _ = this.push_token(Token::ParseError, start, end); + errors.push(TokenizeError { + err: e, + range: start..end, + }); + } + } + } + + if errors.is_empty() { + Ok(this) + } else { + Err((this, errors)) + } + } + pub fn new(bytes: &'a [u8]) -> Result> { let mut this = Self { source: Chars { bytes, offset: 0 }, @@ -326,23 +459,16 @@ impl<'a> Tokenizer<'a> { let start = self.source.position(); - let token = self.source.try_parse_result(|source| { - let a = try_parse_integral_type(source).map(|o| o.map(|_| Token::IntegralType)); - residual!(none: a); - - let mut peeking = source.clone(); + let token = { + let mut peeking = self.source.clone(); match peeking.next() { - Some('0'..='9') => { - return Ok(Some(parse_constant(source)?)); + Some('0'..='9') => Some(parse_constant(&mut self.source)?), + Some('.') if peeking.next().map(|c| crate::common::is_digit(c)) == Some(true) => { + Some(parse_constant(&mut self.source)?) } - Some('.') if peeking.next().map(|c| ['b', 'x', 'o'].contains(&c)) == Some(true) => { - return Ok(Some(parse_constant(source)?)); - } - _ => {} + _ => None, } - - Ok(None) - })?; + }; if let Some(token) = token { return self.push_token(token, start, self.source.position()); @@ -623,7 +749,7 @@ fn try_parse_exp_part(source: &mut Chars) -> Result> { /// DEC_DIGITS FloatingType? /// `.` DEC_DIGITS EXP_PART? FloatingType? /// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType? -fn parse_constant(source: &mut Chars) -> Result { +fn parse_constant_inner(source: &mut Chars) -> Result { let zero = source.next_if(|&c| c == '0').is_some(); let radix = zero .then(|| source.next_if_map(|c| Radix::from_char(c))) @@ -680,3 +806,22 @@ fn parse_constant(source: &mut Chars) -> Result { Ok(token) } + +/// CONSTANT <- +/// DEC_DIGITS IntegralType? +/// `0x` HEX_DIGITS IntegralType? +/// `0b` BIN_DIGITS IntegralType? +/// `0o` OCT_DIGITS IntegralType? +/// DEC_DIGITS FloatingType? +/// `.` DEC_DIGITS EXP_PART? FloatingType? +/// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType? +fn parse_constant(source: &mut Chars) -> Result { + let constant = parse_constant_inner(source)?; + // char following a constant must not be id_continue + source + .peek() + .filter(|&c| !crate::common::is_id_continue(c)) + .ok_or(Error::NumericalConstantUnknownSuffix)?; + + Ok(constant) +} diff --git a/src/lib.rs b/src/lib.rs index 500e683..578129b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,14 @@ #![feature(extract_if, iter_advance_by)] #![allow(dead_code, unused_macros)] -mod ast; -mod common; -mod lexer; -mod parser; -mod tokens; +pub mod ast; +pub mod common; +pub mod lexer; +pub mod parser; +pub mod tokens; + +pub fn tokenize<'a>( + bytes: &'a [u8], +) -> Result, (lexer::Tokenizer<'a>, Vec)> { + lexer::Tokenizer::new_with_errors(bytes) +} diff --git a/src/parser.rs b/src/parser.rs index 2773a2c..7b165a3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -4,6 +4,7 @@ use itertools::Itertools; use crate::{ ast::{FloatingType, IntegralType, LetOrVar, Node, PrimitiveType, Tag}, + common::NextIf, lexer::{Radix, TokenIterator}, tokens::Token, }; @@ -18,6 +19,8 @@ pub enum Error { ExpectedTokenNotFound(Token), #[error("Dummy message.")] ExpectedLetOrVar, + #[error("Dummy message.")] + IntegralTypeTooWide, } pub type Result = core::result::Result; @@ -91,6 +94,64 @@ impl Tree { node } + fn is_integral_type(lexeme: &str) -> Option<()> { + let mut iter = lexeme.chars(); + iter.next_if(|&c| c == 'u' || c == 'i')?; + iter.next_if(|&c| crate::common::is_digit(c))?; + iter.take_while_ref(|&c| crate::common::is_digit(c)).count(); + iter.next().is_none().then_some(()) + } + + // returns an option instead of a result because failure here means the + // lexeme is actually an identifier. + fn try_parse_integral_type(lexeme: &str) -> Result> { + let mut iter = lexeme.chars().peekable(); + let signed = match iter.next() { + Some('u') => false, + Some('i') => true, + _ => { + return Ok(None); + } + }; + + // need 1 digit for an integral type + if iter.peek().map(|&c| crate::common::is_digit(c)) != Some(true) { + return Ok(None); + } + + // need no nondigits after digits + if iter + .clone() + .skip_while(|&c| crate::common::is_digit(c)) + .next() + .is_some() + { + return Ok(None); + } + + let mut bits = 0u16; + loop { + let Some(digit) = iter.next().map(|c| c as u8 - b'0') else { + break; + }; + + match bits + .checked_mul(10) + .and_then(|bits| bits.checked_add(digit as u16)) + { + Some(val) => { + bits = val; + } + None => { + // this IS an integral type, but it is bigger than u/i65535 + return Err(Error::IntegralTypeTooWide); + } + } + } + + Ok(Some(IntegralType { signed, bits })) + } + /// returns (signed, bits) fn parse_integral_type(lexeme: &str) -> IntegralType { let mut iter = lexeme.chars(); @@ -164,10 +225,6 @@ impl Tree { pub fn parse_primitive_type(&mut self, tokens: &mut TokenIterator) -> Result { let token = tokens.next().ok_or(Error::UnexpectedEndOfTokens)?; let prim = match token.token() { - Token::IntegralType => { - let int = Self::parse_integral_type(token.lexeme()); - return Ok(self.push_tag(Tag::IntegralType(int))); - } Token::Void => PrimitiveType::Void, Token::Bool => PrimitiveType::Bool, Token::F32 => PrimitiveType::FloatingType(FloatingType::Binary32), @@ -191,9 +248,15 @@ impl Tree { pub fn parse_typename(&mut self, tokens: &mut TokenIterator) -> Result { match tokens.peek_token_or_err()?.token() { Token::Star => self.parse_pointer(tokens), - Token::Ident => Ok(self.push_tag(Tag::Ident { - name: tokens.next().unwrap().lexeme().to_owned(), - })), + Token::Ident => { + let token = tokens.next().unwrap(); + match Self::try_parse_integral_type(token.lexeme())? { + Some(int) => Ok(self.push_tag(Tag::IntegralType(int))), + None => Ok(self.push_tag(Tag::Ident { + name: token.lexeme().to_owned(), + })), + } + } _ => self.parse_primitive_type(tokens), } } diff --git a/src/tokens.rs b/src/tokens.rs index 3188943..9a3f0c0 100644 --- a/src/tokens.rs +++ b/src/tokens.rs @@ -53,6 +53,7 @@ macro_rules! tokens { tokens!(pub Token: { Eof, + ParseError, // Marker Token for any Comment Comment, DocComment, diff --git a/tests/faulty/non_id_start.sea b/tests/faulty/non_id_start.sea new file mode 100644 index 0000000..eeb75c6 --- /dev/null +++ b/tests/faulty/non_id_start.sea @@ -0,0 +1,4 @@ + +fn 234test() { + return 3; +} \ No newline at end of file