simple lexing error reporting

This commit is contained in:
Janis 2024-08-09 18:58:26 +02:00
parent 69e67c882d
commit 02be9bdc26
7 changed files with 323 additions and 48 deletions

View file

@ -4,6 +4,8 @@ version = "0.1.0"
edition = "2021"
[dependencies]
ansi_term = "0.12.1"
clap = "4.5.14"
itertools = "0.13.0"
log = "0.4.22"
thiserror = "1.0.63"

54
src/bin/tokenizer.rs Normal file
View file

@ -0,0 +1,54 @@
use std::{io::Read, path::PathBuf};
use compiler::*;
use lexer::SourceLocation;
/// CLI entry point: reads sea source from `-i <path>` (or stdin when the
/// flag is absent/unreadable), tokenizes it, and either prints every token
/// with its source location or reports all lexing errors with squiggle
/// underlines.
fn main() {
    let cmd = clap::Command::new("sea-tokens").bin_name("sea-tokens").arg(
        clap::Arg::new("input")
            .short('i')
            .help("sea source file.")
            .value_parser(clap::builder::PathBufValueParser::new()),
    );
    let matches = cmd.get_matches();
    let path = matches.get_one::<PathBuf>("input");
    let source = path
        .and_then(|p| std::fs::read(p).ok())
        .or_else(|| {
            // Fall back to stdin when no readable file was given.
            let mut buf = Vec::new();
            // BUGFIX: `read(&mut buf)` on an empty Vec reads 0 bytes
            // (the slice it derefs to has length 0); `read_to_end`
            // actually consumes all of stdin.
            std::io::stdin().read_to_end(&mut buf).ok()?;
            Some(buf)
        })
        .expect("no source bytes.");
    let tokens = tokenize(&source);
    match tokens {
        Ok(tokens) => {
            for tok in tokens.iter() {
                println!("{}@[{}]", tok.token(), tok.source_location().start);
            }
        }
        Err((tokens, errors)) => {
            // BUGFIX: trailing space so the message and the input name
            // don't run together ("3 errors while tokenizingfoo.sea").
            eprint!("{} errors while tokenizing ", errors.len());
            match path {
                Some(path) => {
                    eprint!("{}", path.display());
                }
                None => {
                    eprint!("stdin");
                }
            }
            eprintln!(":");
            for error in &errors {
                // Full source lines covering the error plus its
                // line/column span, for the squiggle renderer.
                let lines = tokens.src().get_lines(error.range.start, error.range.end);
                let location = tokens
                    .src()
                    .get_source_span(error.range.start, error.range.end);
                eprintln!("Error: {}", error.err);
                SourceLocation::squiggle_line(location, lines);
            }
        }
    }
}

View file

@ -1,3 +1,5 @@
use std::fmt::Display;
use crate::tokens::Token;
use crate::tokens::TokenPos;
use itertools::Itertools;
@ -11,18 +13,20 @@ pub enum Error {
StringError(String),
#[error("Exp part of floating constant had no digits.")]
FloatingConstantExpPartNoDigit,
#[error("Dummy Message.")]
#[error("constant cannot start with leading underscore '_'.")]
NumericalConstantDigitLeadingUnderscore,
#[error("Dummy Message.")]
#[error("Expected digit here for constant.")]
NumericalConstantDigitNoDigit,
#[error("Dummy Message.")]
#[error("Expected digit here for integer constant.")]
IntegralTypeExpectedDigit,
#[error("Dummy Message.")]
#[error("Floating constant has invalid trailing type.")]
FloatingConstantInvalidTrailingType,
#[error("Dummy Message.")]
#[error("Invalid token.")]
InvalidToken,
#[error("Dummy Message.")]
#[error("Identifier starts with invalid character.")]
ExpectedIdStartForIdentifier,
#[error("Unknown suffix in constant.")]
NumericalConstantUnknownSuffix,
}
pub type Result<T> = core::result::Result<T, Error>;
@ -35,11 +39,22 @@ pub struct Chars<'a> {
impl<'a> Chars<'a> {
pub fn as_str(&self) -> &str {
unsafe { core::str::from_utf8_unchecked(&self.bytes[self.offset..]) }
let offset = self.offset.min(self.num_bytes());
unsafe { core::str::from_utf8_unchecked(&self.bytes[offset..]) }
}
/// Moves the read cursor to the absolute byte `offset`.
pub fn seek(&mut self, offset: u32) {
self.offset = offset as usize;
}
/// Total number of bytes in the underlying source.
pub fn num_bytes(&self) -> usize {
self.bytes.len()
}
/// True once the cursor has consumed every byte of the source.
pub fn is_eof(&self) -> bool {
self.offset >= self.bytes.len()
}
/// Returns the next char without advancing (works on a clone of the iterator).
pub fn peek(&self) -> Option<char> {
self.clone().next()
}
@ -52,13 +67,9 @@ impl<'a> Chars<'a> {
self.offset
}
pub fn get_source_span(
&self,
start: u32,
end: u32,
) -> std::ops::RangeInclusive<SourceLocation> {
pub fn get_source_span(&self, start: u32, end: u32) -> std::ops::Range<SourceLocation> {
let (start_l, start_c) = {
let range = self.get_range(0, start);
let range = self.get_from_to(0, start);
range.chars().fold((1u32, 0u32), |(line, col), c| {
if c == '\n' {
(line + 1, 0)
@ -68,7 +79,7 @@ impl<'a> Chars<'a> {
})
};
let (end_l, end_c) = {
let range = self.get_range(start, end);
let range = self.get_from_to(start, end);
range.chars().fold((start_l, start_c), |(line, col), c| {
if c == '\n' {
(line + 1, 0)
@ -78,13 +89,40 @@ impl<'a> Chars<'a> {
})
};
core::ops::RangeInclusive::new(
SourceLocation::new(start_l, start_c),
SourceLocation::new(end_l, end_c),
)
core::ops::Range {
start: SourceLocation::new(start_l, start_c),
end: SourceLocation::new(end_l, end_c),
}
}
pub fn get_range(&self, start: u32, end: u32) -> &str {
/// Returns the complete source lines covering the byte range `start..end`:
/// the span is widened left to just past the previous `'\n'` (or the start
/// of the source) and right to the next `'\n'` at or after `end` (or the
/// end of the source).
pub fn get_lines(&self, start: u32, end: u32) -> &str {
    // Widen left: locate the last '\n' before `start`, then step past it.
    // (clippy `skip_while_next`: `.rev().find(..)` replaces
    // `.rev().skip_while(..).next()`.)
    let before = self.get_from_to(0, start);
    let line_start = before
        .char_indices()
        .rev()
        .find(|&(_, c)| c == '\n')
        .map(|(idx, c)| idx + c.len_utf8())
        .unwrap_or(0);
    // Widen right: first '\n' at or after `end`; indices here are relative
    // to `end`, so add it back to get an absolute offset.
    let after = self.get_from_to(end, self.num_bytes() as u32);
    let line_end = after
        .char_indices()
        .find(|&(_, c)| c == '\n')
        .map(|(idx, _)| idx as u32 + end)
        .unwrap_or(self.num_bytes() as u32);
    self.get_from_to(line_start as u32, line_end)
}
/// Returns the source text for the byte `range`.
/// SAFETY assumption: `bytes` is valid UTF-8 and `range` lies on char
/// boundaries — TODO(review) confirm all callers uphold this, since
/// `from_utf8_unchecked` performs no validation.
pub fn get_range(&self, range: core::ops::Range<u32>) -> &str {
unsafe {
core::str::from_utf8_unchecked(&self.bytes[range.start as usize..range.end as usize])
}
}
/// Returns the source text between byte offsets `start` and `end`.
/// SAFETY assumption: same as `get_range` — valid UTF-8 on char
/// boundaries; `from_utf8_unchecked` does not check.
pub fn get_from_to(&self, start: u32, end: u32) -> &str {
unsafe { core::str::from_utf8_unchecked(&self.bytes[start as usize..end as usize]) }
}
@ -163,10 +201,45 @@ pub struct SourceLocation {
pub column: u32,
}
impl Display for SourceLocation {
    /// Renders the location as `l:<line>,c:<column>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { line, column } = self;
        write!(f, "l:{},c:{}", line, column)
    }
}
impl SourceLocation {
/// Creates a location from a 1-based line and 0-based column.
/// NOTE(review): line starts at 1 and column at 0 per the folds in
/// `get_source_span` — confirm this asymmetry is intended.
pub fn new(line: u32, column: u32) -> Self {
Self { line, column }
}
/// Prints `lines` (the full source lines of an error) with red `^~~~`
/// squiggles under the sub-range described by `this`.
/// NOTE(review): output goes to stdout via `println!` while the caller
/// prints the error header to stderr — confirm the stream split is
/// intentional.
pub fn squiggle_line(this: core::ops::Range<Self>, lines: &str) {
let lines = lines.lines();
// Number of lines the squiggle spans.
// NOTE(review): for a single-line span this is 0, so the
// `i + 1 == squiggle_lines` branch below never fires and `end`
// falls through to `line.len()` — looks like an off-by-one;
// confirm against a single-line error case.
let squiggle_lines = this.end.line - this.start.line;
for (i, line) in lines.enumerate() {
println!("{line}");
let squiggle_range = {
// First line starts squiggling at the error column; later
// lines start at column 0.
let start = if i == 0 { this.start.column } else { 0 };
// Last line stops at the error's end column; earlier lines
// squiggle to the end of the line.
let end = if i as u32 + 1 == squiggle_lines {
this.end.column
} else {
line.len() as u32
};
start..end
};
if !squiggle_range.is_empty() {
// Indent up to the squiggle start, then `^` followed by `~`s.
for _ in 0..squiggle_range.start {
print!(" ");
}
print!("{}", ansi_term::Colour::Red.paint("^"));
for _ in squiggle_range.start..(squiggle_range.end - 1) {
print!("{}", ansi_term::Colour::Red.paint("~"));
}
println!();
}
}
}
}
impl<'a> TokenItem<'a> {
@ -177,10 +250,10 @@ impl<'a> TokenItem<'a> {
pub fn lexeme(&self) -> &str {
self.tokenizer
.source
.get_range(self.inner.start, self.inner.end)
.get_from_to(self.inner.start, self.inner.end)
}
pub fn source_location(&self) -> std::ops::RangeInclusive<SourceLocation> {
pub fn source_location(&self) -> std::ops::Range<SourceLocation> {
self.tokenizer
.source
.get_source_span(self.inner.start, self.inner.end)
@ -275,6 +348,11 @@ macro_rules! residual {
};
}
/// A lexing error paired with the byte range of the offending source text.
pub struct TokenizeError {
// The underlying lexer error.
pub err: Error,
// Byte offsets into the source where the error occurred.
pub range: core::ops::Range<u32>,
}
impl<'a> Tokenizer<'a> {
pub fn iter(&self) -> TokenIterator {
TokenIterator {
@ -283,6 +361,61 @@ impl<'a> Tokenizer<'a> {
}
}
pub fn src(&self) -> &Chars<'a> {
&self.source
}
/// Tokenizes `bytes`, collecting every lexing error instead of stopping
/// at the first one.
///
/// On an error a `Token::ParseError` marker is pushed covering the failed
/// span and recovery skips ahead to the next whitespace so the following
/// token starts cleanly. Returns `Ok(tokenizer)` when the input lexed
/// cleanly, or `Err((tokenizer, errors))` with the partially-recovered
/// tokenizer plus all collected errors.
pub fn new_with_errors(
    bytes: &'a [u8],
) -> core::result::Result<Self, (Self, Vec<TokenizeError>)> {
    let mut this = Self {
        source: Chars { bytes, offset: 0 },
        tokens: Vec::new(),
    };
    let mut errors = Vec::new();
    while !this.source.is_eof() {
        let start = this.source.position();
        match this.next_token() {
            Ok(_) => {}
            Err(e) => {
                let end = this.source.position();
                // Recovery: unless we already stopped at whitespace,
                // consume the rest of the broken lexeme.
                if this.source.peek().map(|c| crate::common::is_whitespace(c)) != Some(true) {
                    this.source
                        .take_while_ref(|&c| !crate::common::is_whitespace(c))
                        .count();
                }
                // Keep the token stream aligned with the source; the
                // push result is irrelevant for an error marker.
                _ = this.push_token(Token::ParseError, start, end);
                errors.push(TokenizeError {
                    err: e,
                    range: start..end,
                });
            }
        }
    }
    if errors.is_empty() {
        Ok(this)
    } else {
        Err((this, errors))
    }
}
pub fn new(bytes: &'a [u8]) -> Result<Tokenizer<'a>> {
let mut this = Self {
source: Chars { bytes, offset: 0 },
@ -326,23 +459,16 @@ impl<'a> Tokenizer<'a> {
let start = self.source.position();
let token = self.source.try_parse_result(|source| {
let a = try_parse_integral_type(source).map(|o| o.map(|_| Token::IntegralType));
residual!(none: a);
let mut peeking = source.clone();
let token = {
let mut peeking = self.source.clone();
match peeking.next() {
Some('0'..='9') => {
return Ok(Some(parse_constant(source)?));
Some('0'..='9') => Some(parse_constant(&mut self.source)?),
Some('.') if peeking.next().map(|c| crate::common::is_digit(c)) == Some(true) => {
Some(parse_constant(&mut self.source)?)
}
Some('.') if peeking.next().map(|c| ['b', 'x', 'o'].contains(&c)) == Some(true) => {
return Ok(Some(parse_constant(source)?));
}
_ => {}
_ => None,
}
Ok(None)
})?;
};
if let Some(token) = token {
return self.push_token(token, start, self.source.position());
@ -623,7 +749,7 @@ fn try_parse_exp_part(source: &mut Chars) -> Result<Option<()>> {
/// DEC_DIGITS FloatingType?
/// `.` DEC_DIGITS EXP_PART? FloatingType?
/// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
fn parse_constant(source: &mut Chars) -> Result<Token> {
fn parse_constant_inner(source: &mut Chars) -> Result<Token> {
let zero = source.next_if(|&c| c == '0').is_some();
let radix = zero
.then(|| source.next_if_map(|c| Radix::from_char(c)))
@ -680,3 +806,22 @@ fn parse_constant(source: &mut Chars) -> Result<Token> {
Ok(token)
}
/// CONSTANT <-
/// DEC_DIGITS IntegralType?
/// `0x` HEX_DIGITS IntegralType?
/// `0b` BIN_DIGITS IntegralType?
/// `0o` OCT_DIGITS IntegralType?
/// DEC_DIGITS FloatingType?
/// `.` DEC_DIGITS EXP_PART? FloatingType?
/// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
fn parse_constant(source: &mut Chars) -> Result<Token> {
let constant = parse_constant_inner(source)?;
// char following a constant must not be id_continue
source
.peek()
.filter(|&c| !crate::common::is_id_continue(c))
.ok_or(Error::NumericalConstantUnknownSuffix)?;
Ok(constant)
}

View file

@ -1,8 +1,14 @@
#![feature(extract_if, iter_advance_by)]
#![allow(dead_code, unused_macros)]
mod ast;
mod common;
mod lexer;
mod parser;
mod tokens;
pub mod ast;
pub mod common;
pub mod lexer;
pub mod parser;
pub mod tokens;
/// Tokenizes `bytes` as sea source.
///
/// Thin public wrapper over `Tokenizer::new_with_errors`: returns
/// `Ok(tokenizer)` on a clean lex, or `Err((tokenizer, errors))` with the
/// partially-recovered token stream and every lexing error collected.
pub fn tokenize<'a>(
bytes: &'a [u8],
) -> Result<lexer::Tokenizer<'a>, (lexer::Tokenizer<'a>, Vec<lexer::TokenizeError>)> {
lexer::Tokenizer::new_with_errors(bytes)
}

View file

@ -4,6 +4,7 @@ use itertools::Itertools;
use crate::{
ast::{FloatingType, IntegralType, LetOrVar, Node, PrimitiveType, Tag},
common::NextIf,
lexer::{Radix, TokenIterator},
tokens::Token,
};
@ -18,6 +19,8 @@ pub enum Error {
ExpectedTokenNotFound(Token),
#[error("Dummy message.")]
ExpectedLetOrVar,
#[error("Dummy message.")]
IntegralTypeTooWide,
}
pub type Result<T> = core::result::Result<T, Error>;
@ -91,6 +94,64 @@ impl Tree {
node
}
/// Returns `Some(())` when `lexeme` has the exact shape of an integral
/// type name: `u` or `i` followed by one or more digits and nothing else.
/// NOTE(review): `next_if`/`take_while_ref` on a plain `Chars` come from
/// the project `NextIf` trait and itertools — confirm `next_if` here
/// matches `Peekable::next_if` semantics (no consumption on mismatch).
fn is_integral_type(lexeme: &str) -> Option<()> {
let mut iter = lexeme.chars();
iter.next_if(|&c| c == 'u' || c == 'i')?;
iter.next_if(|&c| crate::common::is_digit(c))?;
iter.take_while_ref(|&c| crate::common::is_digit(c)).count();
iter.next().is_none().then_some(())
}
// returns an option instead of a result because failure here means the
// lexeme is actually an identifier.
fn try_parse_integral_type(lexeme: &str) -> Result<Option<IntegralType>> {
let mut iter = lexeme.chars().peekable();
let signed = match iter.next() {
Some('u') => false,
Some('i') => true,
_ => {
return Ok(None);
}
};
// need 1 digit for an integral type
if iter.peek().map(|&c| crate::common::is_digit(c)) != Some(true) {
return Ok(None);
}
// need no nondigits after digits
if iter
.clone()
.skip_while(|&c| crate::common::is_digit(c))
.next()
.is_some()
{
return Ok(None);
}
let mut bits = 0u16;
loop {
let Some(digit) = iter.next().map(|c| c as u8 - b'0') else {
break;
};
match bits
.checked_mul(10)
.and_then(|bits| bits.checked_add(digit as u16))
{
Some(val) => {
bits = val;
}
None => {
// this IS an integral type, but it is bigger than u/i65535
return Err(Error::IntegralTypeTooWide);
}
}
}
Ok(Some(IntegralType { signed, bits }))
}
/// returns (signed, bits)
fn parse_integral_type(lexeme: &str) -> IntegralType {
let mut iter = lexeme.chars();
@ -164,10 +225,6 @@ impl Tree {
pub fn parse_primitive_type(&mut self, tokens: &mut TokenIterator) -> Result<Node> {
let token = tokens.next().ok_or(Error::UnexpectedEndOfTokens)?;
let prim = match token.token() {
Token::IntegralType => {
let int = Self::parse_integral_type(token.lexeme());
return Ok(self.push_tag(Tag::IntegralType(int)));
}
Token::Void => PrimitiveType::Void,
Token::Bool => PrimitiveType::Bool,
Token::F32 => PrimitiveType::FloatingType(FloatingType::Binary32),
@ -191,9 +248,15 @@ impl Tree {
pub fn parse_typename(&mut self, tokens: &mut TokenIterator) -> Result<Node> {
match tokens.peek_token_or_err()?.token() {
Token::Star => self.parse_pointer(tokens),
Token::Ident => Ok(self.push_tag(Tag::Ident {
name: tokens.next().unwrap().lexeme().to_owned(),
})),
Token::Ident => {
let token = tokens.next().unwrap();
match Self::try_parse_integral_type(token.lexeme())? {
Some(int) => Ok(self.push_tag(Tag::IntegralType(int))),
None => Ok(self.push_tag(Tag::Ident {
name: token.lexeme().to_owned(),
})),
}
}
_ => self.parse_primitive_type(tokens),
}
}

View file

@ -53,6 +53,7 @@ macro_rules! tokens {
tokens!(pub Token: {
Eof,
ParseError,
// Marker Token for any Comment
Comment,
DocComment,

View file

@ -0,0 +1,4 @@
fn 234test() {
return 3;
}