from-scratch/lang/tests/tokens.rs
2025-10-28 20:40:25 +01:00

230 lines
6 KiB
Rust

/// Exported with the C ABI under the unmangled symbol name `panic`, so code
/// outside this Rust file can call it by that name.
///
/// Never returns: it raises a Rust panic with a fixed message.
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
/// A token as seen by the tests: a numeric token id (`.0`) paired with the
/// text it was recognised from (`.1`).
#[derive(Debug)]
struct Lexeme(u8, &'static str);

impl Lexeme {
    /// Returns the text carried by this lexeme.
    fn lex(&self) -> &'static str {
        self.1
    }
}

impl PartialEq for Lexeme {
    /// Two lexemes are equal when their token ids match; for ids 30 and 31
    /// (identifiers and numbers) the text has to match as well.
    ///
    /// NOTE(review): string tokens (id 32) are given expected text in `main`,
    /// but that text is ignored here because only ids 30 and 31 compare text.
    /// Confirm whether that is intended.
    fn eq(&self, other: &Self) -> bool {
        // Different token ids can never be equal, whatever the text says.
        if self.0 != other.0 {
            return false;
        }
        // Same id: the text only matters for identifiers and numbers.
        !matches!(self.0, 30 | 31) || self.1 == other.1
    }
}

impl Eq for Lexeme {}
/// Conversion of a value into an optional [`Lexeme`].
trait AsLexeme {
/// Consumes `self` and returns the lexeme it represents, or `None` when it
/// does not represent one.
fn as_lexeme(self) -> Option<Lexeme>;
}
/// C-layout record returned by `find_lexeme`.
#[repr(C)]
struct LexemeRaw {
    /// Token id; `0` is treated as "no token".
    token: u8,
    /// Pointer to the first byte of the lexeme text.
    lexeme: *const u8,
    /// Length of the lexeme text in bytes.
    len: usize,
}

impl AsLexeme for LexemeRaw {
    /// Converts the raw record into a [`Lexeme`], or `None` when `token` is 0.
    ///
    /// The pointer is only dereferenced for non-zero tokens with a non-null
    /// pointer and a non-zero length; every other record yields an empty
    /// string. This avoids building a slice from a null pointer, which
    /// `slice::from_raw_parts` forbids even for a length of zero.
    fn as_lexeme(self) -> Option<Lexeme> {
        let Self { token, lexeme, len } = self;

        // Check the token first so an end-of-input record never has its
        // pointer/length pair interpreted.
        if token == 0 {
            return None;
        }

        let text = if lexeme.is_null() || len == 0 {
            ""
        } else {
            // SAFETY: `lexeme` is non-null and `len` is non-zero here.
            // NOTE(review): this assumes the tokeniser hands out `len`
            // readable, valid UTF-8 bytes that stay alive and unchanged while
            // the returned `Lexeme` is in use; the `'static` lifetime is not
            // checked by the compiler. Confirm against the tokeniser.
            unsafe { core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len)) }
        };

        Some(Lexeme(token, text))
    }
}
// Foreign symbols used by this test; their definitions are not in this file
// (presumably they come from the tokeniser implementation linked into the
// test — confirm against the build setup).
// `dead_code` is allowed because the visible part of this file only uses a
// subset of these declarations.
#[allow(dead_code)]
unsafe extern "C" {
// Called by `main` with a pointer to a NUL-terminated path before each batch
// of tokens is collected.
// NOTE(review): presumably (re)loads that file into the tokeniser — confirm.
unsafe fn tokeniser_init(path: *const i8) -> ();
unsafe fn tokeniser_print() -> ();
unsafe fn is_ident(len: usize) -> bool;
unsafe fn is_number(len: usize) -> bool;
unsafe fn skip_whitespace() -> ();
// Returns one raw lexeme record per call; `collect_tokens` keeps calling it
// until the record converts to `None` (token id 0).
unsafe fn find_lexeme() -> LexemeRaw;
// NOTE(review): the four symbols below look like lookup tables/counters of the
// tokeniser; their real layout cannot be determined from this file — confirm
// before reading through them.
static mut LEXEMES: *const u8;
static mut LEXEME_LENS: usize;
static mut NUM_LEXEMES: usize;
static mut TOKENS: u8;
// `main` asserts that these four are 0 / null before the first call to
// `tokeniser_init`.
static mut input_file: u32;
static mut buffer: *mut u8;
static mut cursor: usize;
static mut buffer_len: usize;
// Declared as never returning.
unsafe fn exit(code: i32) -> !;
}
/// Calls `find_lexeme` repeatedly and gathers every lexeme it yields, in
/// order, stopping at the first record that converts to `None`.
fn collect_tokens() -> Vec<Lexeme> {
    core::iter::from_fn(|| {
        // SAFETY: foreign call without arguments.
        // NOTE(review): assumes `tokeniser_init` has been called beforehand,
        // as `main` does — confirm the tokeniser's actual preconditions.
        let raw = unsafe { find_lexeme() };
        raw.as_lexeme()
    })
    .collect()
}
fn main() {
unsafe {
// assert initial state
assert_eq!((&raw const input_file).read(), 0);
assert_eq!((&raw const buffer_len).read(), 0);
assert_eq!((&raw const cursor).read(), 0);
assert_eq!((&raw const buffer).read(), core::ptr::null_mut());
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(1, ""),
Lexeme(2, ""),
Lexeme(3, ""),
Lexeme(4, ""),
Lexeme(8, ""),
Lexeme(13, ""),
Lexeme(11, ""),
Lexeme(10, ""),
Lexeme(9, ""),
Lexeme(5, ""),
][..]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(28, ""),
Lexeme(29, ""),
Lexeme(21, ""),
Lexeme(20, ""),
Lexeme(24, ""),
Lexeme(12, ""),
Lexeme(23, ""),
Lexeme(22, ""),
Lexeme(15, ""),
][..]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(30, "this-is-an-ident"),
Lexeme(30, "another_ident123"),
Lexeme(30, "_underscore_test"),
Lexeme(30, "mixedCASEIdent"),
Lexeme(30, "number12345"),
Lexeme(30, "____"),
Lexeme(30, "_"),
Lexeme(17, ""),
Lexeme(30, "leading-minus"),
Lexeme(30, "trailing-minus-"),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/comment.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(33, ""),
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(33, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/number.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(31, "1234"),
Lexeme(31, "123_345_"),
Lexeme(31, "1234____56"),
Lexeme(31, "1"),
Lexeme(31, "0"),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/strings.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(32, "\"this is a string\""),
Lexeme(32, "\"another\nstring\nspanning multiple\n lines\""),
Lexeme(32, "\"string with a \\\"quoted\\\" word\""),
Lexeme(32, "\"a\""),
Lexeme(32, "\"\"")
],
);
eprintln!("Finished tokenising.");
}
}