simple lexing error reporting
This commit is contained in:
parent 69e67c882d
commit 02be9bdc26

Cargo.toml
@@ -4,6 +4,8 @@ version = "0.1.0"
edition = "2021"

[dependencies]
ansi_term = "0.12.1"
clap = "4.5.14"
itertools = "0.13.0"
log = "0.4.22"
thiserror = "1.0.63"

src/bin/tokenizer.rs (new file, 54 lines)
@@ -0,0 +1,54 @@
use std::{io::Read, path::PathBuf};

use compiler::*;
use lexer::SourceLocation;

fn main() {
    let cmd = clap::Command::new("sea-tokens").bin_name("sea-tokens").arg(
        clap::Arg::new("input")
            .short('i')
            .help("sea source file.")
            .value_parser(clap::builder::PathBufValueParser::new()),
    );

    let matches = cmd.get_matches();
    let path = matches.get_one::<PathBuf>("input");
    let source = path
        .and_then(|p| std::fs::read(p).ok())
        .or_else(|| {
            let mut buf = Vec::new();
            // read_to_end, not read: read() into an empty Vec reads zero bytes
            std::io::stdin().read_to_end(&mut buf).ok()?;
            Some(buf)
        })
        .expect("no source bytes.");

    let tokens = tokenize(&source);
    match tokens {
        Ok(tokens) => {
            for tok in tokens.iter() {
                println!("{}@[{}]", tok.token(), tok.source_location().start);
            }
        }
        Err((tokens, errors)) => {
            eprint!("{} errors while tokenizing ", errors.len());
            match path {
                Some(path) => {
                    eprint!("{}", path.display());
                }
                None => {
                    eprint!("stdin");
                }
            }
            eprintln!(":");
            for error in &errors {
                let lines = tokens.src().get_lines(error.range.start, error.range.end);
                let location = tokens
                    .src()
                    .get_source_span(error.range.start, error.range.end);

                eprintln!("Error: {}", error.err);
                SourceLocation::squiggle_line(location, lines);
            }
        }
    }
}
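
Note: taken together, a faulty file now yields one diagnostic per bad lexeme while still producing a token stream. A test-style sketch of that contract (made-up input; compiler is the crate this binary links against):

#[test]
fn tokenize_reports_errors() {
    // "234test" is a constant with an identifier tail, which the lexer rejects
    let src = b"fn 234test() {\n    return 3;\n}\n";
    let Err((tokens, errors)) = compiler::tokenize(src) else {
        panic!("expected at least one lexing error");
    };
    assert!(!errors.is_empty());
    for e in &errors {
        // each error carries the byte range of the offending lexeme
        assert!((e.range.end as usize) <= tokens.src().num_bytes());
    }
}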

src/lexer.rs (217 lines changed)
@@ -1,3 +1,5 @@
use std::fmt::Display;

use crate::tokens::Token;
use crate::tokens::TokenPos;
use itertools::Itertools;

@@ -11,18 +13,20 @@ pub enum Error {
    StringError(String),
    #[error("Exp part of floating constant had no digits.")]
    FloatingConstantExpPartNoDigit,
    #[error("Dummy Message.")]
    #[error("constant cannot start with leading underscore '_'.")]
    NumericalConstantDigitLeadingUnderscore,
    #[error("Dummy Message.")]
    #[error("Expected digit here for constant.")]
    NumericalConstantDigitNoDigit,
    #[error("Dummy Message.")]
    #[error("Expected digit here for integer constant.")]
    IntegralTypeExpectedDigit,
    #[error("Dummy Message.")]
    #[error("Floating constant has invalid trailing type.")]
    FloatingConstantInvalidTrailingType,
    #[error("Dummy Message.")]
    #[error("Invalid token.")]
    InvalidToken,
    #[error("Dummy Message.")]
    #[error("Identifier starts with invalid character.")]
    ExpectedIdStartForIdentifier,
    #[error("Unknown suffix in constant.")]
    NumericalConstantUnknownSuffix,
}

pub type Result<T> = core::result::Result<T, Error>;

@@ -35,11 +39,22 @@ pub struct Chars<'a> {

impl<'a> Chars<'a> {
    pub fn as_str(&self) -> &str {
        unsafe { core::str::from_utf8_unchecked(&self.bytes[self.offset..]) }
        let offset = self.offset.min(self.num_bytes());
        unsafe { core::str::from_utf8_unchecked(&self.bytes[offset..]) }
    }

    pub fn seek(&mut self, offset: u32) {
        self.offset = offset as usize;
    }

    pub fn num_bytes(&self) -> usize {
        self.bytes.len()
    }

    pub fn is_eof(&self) -> bool {
        self.offset >= self.bytes.len()
    }

    pub fn peek(&self) -> Option<char> {
        self.clone().next()
    }

@@ -52,13 +67,9 @@ impl<'a> Chars<'a> {
        self.offset
    }

    pub fn get_source_span(
        &self,
        start: u32,
        end: u32,
    ) -> std::ops::RangeInclusive<SourceLocation> {
    pub fn get_source_span(&self, start: u32, end: u32) -> std::ops::Range<SourceLocation> {
        let (start_l, start_c) = {
            let range = self.get_range(0, start);
            let range = self.get_from_to(0, start);
            range.chars().fold((1u32, 0u32), |(line, col), c| {
                if c == '\n' {
                    (line + 1, 0)

@@ -68,7 +79,7 @@ impl<'a> Chars<'a> {
            })
        };
        let (end_l, end_c) = {
            let range = self.get_range(start, end);
            let range = self.get_from_to(start, end);
            range.chars().fold((start_l, start_c), |(line, col), c| {
                if c == '\n' {
                    (line + 1, 0)

@@ -78,13 +89,40 @@ impl<'a> Chars<'a> {
            })
        };

        core::ops::RangeInclusive::new(
            SourceLocation::new(start_l, start_c),
            SourceLocation::new(end_l, end_c),
        )
        core::ops::Range {
            start: SourceLocation::new(start_l, start_c),
            end: SourceLocation::new(end_l, end_c),
        }
    }

    pub fn get_range(&self, start: u32, end: u32) -> &str {
    pub fn get_lines(&self, start: u32, end: u32) -> &str {
        let range = self.get_from_to(0, start);
        let start = range
            .char_indices()
            .rev()
            .skip_while(|&(_, c)| c != '\n')
            .next()
            .map(|(idx, c)| idx + c.len_utf8())
            .unwrap_or(0);

        let range = self.get_from_to(end, self.num_bytes() as u32);
        let end = range
            .char_indices()
            .skip_while(|&(_, c)| c != '\n')
            .next()
            .map(|(idx, _)| idx as u32 + end)
            .unwrap_or(self.num_bytes() as u32);

        self.get_from_to(start as u32, end as u32)
    }

    pub fn get_range(&self, range: core::ops::Range<u32>) -> &str {
        unsafe {
            core::str::from_utf8_unchecked(&self.bytes[range.start as usize..range.end as usize])
        }
    }

    pub fn get_from_to(&self, start: u32, end: u32) -> &str {
        unsafe { core::str::from_utf8_unchecked(&self.bytes[start as usize..end as usize]) }
    }
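
Note: a small walkthrough of how these two helpers cooperate, with a made-up input and Chars built directly (module-internal fields; columns count from 0, as seeded by the fold above):

let src = Chars { bytes: b"let x = 1;\nlet 2y = 0;\n", offset: 0 };

// byte range 15..17 covers "2y" on the second line
let span = src.get_source_span(15, 17);
assert_eq!((span.start.line, span.start.column), (2, 4));
assert_eq!((span.end.line, span.end.column), (2, 6));

// get_lines widens the same byte range to whole source lines for display
assert_eq!(src.get_lines(15, 17), "let 2y = 0;");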

@@ -163,10 +201,45 @@ pub struct SourceLocation {
    pub column: u32,
}

impl Display for SourceLocation {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "l:{},c:{}", self.line, self.column)
    }
}

impl SourceLocation {
    pub fn new(line: u32, column: u32) -> Self {
        Self { line, column }
    }

    pub fn squiggle_line(this: core::ops::Range<Self>, lines: &str) {
        let lines = lines.lines();
        // number of lines the span covers, minus one
        let squiggle_lines = this.end.line - this.start.line;

        for (i, line) in lines.enumerate() {
            println!("{line}");
            let squiggle_range = {
                let start = if i == 0 { this.start.column } else { 0 };
                // on the last line of the span, stop at the span's end column
                let end = if i as u32 == squiggle_lines {
                    this.end.column
                } else {
                    line.len() as u32
                };
                start..end
            };

            if !squiggle_range.is_empty() {
                for _ in 0..squiggle_range.start {
                    print!(" ");
                }
                print!("{}", ansi_term::Colour::Red.paint("^"));
                for _ in squiggle_range.start..(squiggle_range.end - 1) {
                    print!("{}", ansi_term::Colour::Red.paint("~"));
                }
                println!();
            }
        }
    }
}
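
Note: for a single-line span the squiggle is drawn under the offending columns; a sketch of the call and its output (ANSI colours omitted):

// span covering "2y" (line 2, columns 4..6) over its source line
SourceLocation::squiggle_line(
    SourceLocation::new(2, 4)..SourceLocation::new(2, 6),
    "let 2y = 0;",
);
// prints:
// let 2y = 0;
//     ^~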

impl<'a> TokenItem<'a> {

@@ -177,10 +250,10 @@ impl<'a> TokenItem<'a> {
    pub fn lexeme(&self) -> &str {
        self.tokenizer
            .source
            .get_range(self.inner.start, self.inner.end)
            .get_from_to(self.inner.start, self.inner.end)
    }

    pub fn source_location(&self) -> std::ops::RangeInclusive<SourceLocation> {
    pub fn source_location(&self) -> std::ops::Range<SourceLocation> {
        self.tokenizer
            .source
            .get_source_span(self.inner.start, self.inner.end)

@@ -275,6 +348,11 @@ macro_rules! residual {
    };
}

pub struct TokenizeError {
    pub err: Error,
    pub range: core::ops::Range<u32>,
}

impl<'a> Tokenizer<'a> {
    pub fn iter(&self) -> TokenIterator {
        TokenIterator {

@@ -283,6 +361,61 @@ impl<'a> Tokenizer<'a> {
        }
    }

    pub fn src(&self) -> &Chars<'a> {
        &self.source
    }

    pub fn new_with_errors(
        bytes: &'a [u8],
    ) -> core::result::Result<Self, (Self, Vec<TokenizeError>)> {
        let mut this = Self {
            source: Chars { bytes, offset: 0 },
            tokens: Vec::new(),
        };
        let mut errors = Vec::new();

        loop {
            if this.source.is_eof() {
                break;
            }

            let start = this.source.position();

            match this.next_token() {
                Ok(_) => {}
                Err(e) => {
                    // let is_quoted = this
                    //     .source
                    //     .get_range(start, this.source.bytes.len() as u32)
                    //     .chars()
                    //     .take_while_ref(|&c| crate::common::is_whitespace(c))
                    //     .next()
                    //     .map(|c| c == '\'' || c == '"')
                    //     .unwrap_or(false);
                    let end = this.source.position();

                    // skip ahead to the next whitespace so lexing can resume
                    // past the offending lexeme
                    if this.source.peek().map(|c| crate::common::is_whitespace(c)) != Some(true) {
                        this.source
                            .take_while_ref(|&c| !crate::common::is_whitespace(c))
                            .count();
                    }

                    _ = this.push_token(Token::ParseError, start, end);
                    errors.push(TokenizeError {
                        err: e,
                        range: start..end,
                    });
                }
            }
        }

        if errors.is_empty() {
            Ok(this)
        } else {
            Err((this, errors))
        }
    }
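
Note: the loop records the error, emits a ParseError token for the bad lexeme, and resumes at the next whitespace boundary, so one bad lexeme does not hide the rest of the file. A test-style sketch (made-up input; assumes token() returns the Token by value):

#[test]
fn lexer_recovers_after_error() {
    // "234test" is rejected, but "fn" after it is still lexed
    let Err((tokens, errors)) = Tokenizer::new_with_errors(b"234test fn") else {
        panic!("expected a lexing error");
    };
    assert_eq!(errors.len(), 1);
    // the bad lexeme shows up as a ParseError token in the stream
    assert!(tokens.iter().any(|t| matches!(t.token(), Token::ParseError)));
}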

    pub fn new(bytes: &'a [u8]) -> Result<Tokenizer<'a>> {
        let mut this = Self {
            source: Chars { bytes, offset: 0 },

@@ -326,23 +459,16 @@ impl<'a> Tokenizer<'a> {

        let start = self.source.position();

        let token = self.source.try_parse_result(|source| {
            let a = try_parse_integral_type(source).map(|o| o.map(|_| Token::IntegralType));
            residual!(none: a);

            let mut peeking = source.clone();
        let token = {
            let mut peeking = self.source.clone();
            match peeking.next() {
                Some('0'..='9') => {
                    return Ok(Some(parse_constant(source)?));
                Some('0'..='9') => Some(parse_constant(&mut self.source)?),
                Some('.') if peeking.next().map(|c| crate::common::is_digit(c)) == Some(true) => {
                    Some(parse_constant(&mut self.source)?)
                }
                Some('.') if peeking.next().map(|c| ['b', 'x', 'o'].contains(&c)) == Some(true) => {
                    return Ok(Some(parse_constant(source)?));
                }
                _ => {}
                _ => None,
            }

            Ok(None)
        })?;
        };

        if let Some(token) = token {
            return self.push_token(token, start, self.source.position());

@@ -623,7 +749,7 @@ fn try_parse_exp_part(source: &mut Chars) -> Result<Option<()>> {
/// DEC_DIGITS FloatingType?
/// `.` DEC_DIGITS EXP_PART? FloatingType?
/// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
fn parse_constant(source: &mut Chars) -> Result<Token> {
fn parse_constant_inner(source: &mut Chars) -> Result<Token> {
    let zero = source.next_if(|&c| c == '0').is_some();
    let radix = zero
        .then(|| source.next_if_map(|c| Radix::from_char(c)))

@@ -680,3 +806,22 @@ fn parse_constant(source: &mut Chars) -> Result<Token> {

    Ok(token)
}

/// CONSTANT <-
/// DEC_DIGITS IntegralType?
/// `0x` HEX_DIGITS IntegralType?
/// `0b` BIN_DIGITS IntegralType?
/// `0o` OCT_DIGITS IntegralType?
/// DEC_DIGITS FloatingType?
/// `.` DEC_DIGITS EXP_PART? FloatingType?
/// DEC_DIGITS `.` DEC_DIGITS? EXP_PART? FloatingType?
fn parse_constant(source: &mut Chars) -> Result<Token> {
    let constant = parse_constant_inner(source)?;
    // the char following a constant must not be id_continue; EOF is fine
    if source.peek().is_some_and(|c| crate::common::is_id_continue(c)) {
        return Err(Error::NumericalConstantUnknownSuffix);
    }

    Ok(constant)
}
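
Note: a sketch of the trailing-character rule with hand-built Chars values (module-internal; assumes plain decimal constants succeed in parse_constant_inner, and the EOF-tolerant check above):

let mut ok = Chars { bytes: b"123 ", offset: 0 };
assert!(parse_constant(&mut ok).is_ok()); // ' ' is not id_continue

let mut eof = Chars { bytes: b"123", offset: 0 };
assert!(parse_constant(&mut eof).is_ok()); // end of input is fine

let mut bad = Chars { bytes: b"123abc", offset: 0 };
// 'a' is id_continue, so the constant has an unknown suffix
assert!(matches!(
    parse_constant(&mut bad),
    Err(Error::NumericalConstantUnknownSuffix)
));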

src/lib.rs (16 lines changed)
@@ -1,8 +1,14 @@
#![feature(extract_if, iter_advance_by)]
#![allow(dead_code, unused_macros)]

mod ast;
mod common;
mod lexer;
mod parser;
mod tokens;
pub mod ast;
pub mod common;
pub mod lexer;
pub mod parser;
pub mod tokens;

pub fn tokenize<'a>(
    bytes: &'a [u8],
) -> Result<lexer::Tokenizer<'a>, (lexer::Tokenizer<'a>, Vec<lexer::TokenizeError>)> {
    lexer::Tokenizer::new_with_errors(bytes)
}

src/parser.rs
@@ -4,6 +4,7 @@ use itertools::Itertools;

use crate::{
    ast::{FloatingType, IntegralType, LetOrVar, Node, PrimitiveType, Tag},
    common::NextIf,
    lexer::{Radix, TokenIterator},
    tokens::Token,
};

@@ -18,6 +19,8 @@ pub enum Error {
    ExpectedTokenNotFound(Token),
    #[error("Dummy message.")]
    ExpectedLetOrVar,
    #[error("Dummy message.")]
    IntegralTypeTooWide,
}

pub type Result<T> = core::result::Result<T, Error>;

@@ -91,6 +94,64 @@ impl Tree {
        node
    }

    fn is_integral_type(lexeme: &str) -> Option<()> {
        let mut iter = lexeme.chars();
        iter.next_if(|&c| c == 'u' || c == 'i')?;
        iter.next_if(|&c| crate::common::is_digit(c))?;
        iter.take_while_ref(|&c| crate::common::is_digit(c)).count();
        iter.next().is_none().then_some(())
    }

    // returns an option instead of a result because failure here means the
    // lexeme is actually an identifier.
    fn try_parse_integral_type(lexeme: &str) -> Result<Option<IntegralType>> {
        let mut iter = lexeme.chars().peekable();
        let signed = match iter.next() {
            Some('u') => false,
            Some('i') => true,
            _ => {
                return Ok(None);
            }
        };

        // need 1 digit for an integral type
        if iter.peek().map(|&c| crate::common::is_digit(c)) != Some(true) {
            return Ok(None);
        }

        // need no nondigits after digits
        if iter
            .clone()
            .skip_while(|&c| crate::common::is_digit(c))
            .next()
            .is_some()
        {
            return Ok(None);
        }

        let mut bits = 0u16;
        loop {
            let Some(digit) = iter.next().map(|c| c as u8 - b'0') else {
                break;
            };

            match bits
                .checked_mul(10)
                .and_then(|bits| bits.checked_add(digit as u16))
            {
                Some(val) => {
                    bits = val;
                }
                None => {
                    // this IS an integral type, but it is bigger than u/i65535
                    return Err(Error::IntegralTypeTooWide);
                }
            }
        }

        Ok(Some(IntegralType { signed, bits }))
    }
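
Note: a sketch of the three possible outcomes (module-internal calls with made-up lexemes):

// not integral-type shaped: it is an ordinary identifier
assert!(matches!(Tree::try_parse_integral_type("usize"), Ok(None)));

// a real integral type: unsigned, 32 bits
assert!(matches!(
    Tree::try_parse_integral_type("u32"),
    Ok(Some(IntegralType { signed: false, bits: 32 }))
));

// integral-type shaped, but wider than the u16 bit counter allows
assert!(matches!(
    Tree::try_parse_integral_type("u99999"),
    Err(Error::IntegralTypeTooWide)
));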

    /// returns (signed, bits)
    fn parse_integral_type(lexeme: &str) -> IntegralType {
        let mut iter = lexeme.chars();

@@ -164,10 +225,6 @@ impl Tree {
    pub fn parse_primitive_type(&mut self, tokens: &mut TokenIterator) -> Result<Node> {
        let token = tokens.next().ok_or(Error::UnexpectedEndOfTokens)?;
        let prim = match token.token() {
            Token::IntegralType => {
                let int = Self::parse_integral_type(token.lexeme());
                return Ok(self.push_tag(Tag::IntegralType(int)));
            }
            Token::Void => PrimitiveType::Void,
            Token::Bool => PrimitiveType::Bool,
            Token::F32 => PrimitiveType::FloatingType(FloatingType::Binary32),

@@ -191,9 +248,15 @@ impl Tree {
    pub fn parse_typename(&mut self, tokens: &mut TokenIterator) -> Result<Node> {
        match tokens.peek_token_or_err()?.token() {
            Token::Star => self.parse_pointer(tokens),
            Token::Ident => Ok(self.push_tag(Tag::Ident {
                name: tokens.next().unwrap().lexeme().to_owned(),
            })),
            Token::Ident => {
                let token = tokens.next().unwrap();
                match Self::try_parse_integral_type(token.lexeme())? {
                    Some(int) => Ok(self.push_tag(Tag::IntegralType(int))),
                    None => Ok(self.push_tag(Tag::Ident {
                        name: token.lexeme().to_owned(),
                    })),
                }
            }
            _ => self.parse_primitive_type(tokens),
        }
    }

src/tokens.rs
@@ -53,6 +53,7 @@ macro_rules! tokens {

tokens!(pub Token: {
    Eof,
    ParseError,
    // Marker Token for any Comment
    Comment,
    DocComment,

tests/faulty/non_id_start.sea (new file, 4 lines)
@@ -0,0 +1,4 @@

fn 234test() {
    return 3;
}