from-scratch/lang/tests/tokens.rs
2025-10-30 00:22:24 +01:00

299 lines
8.4 KiB
Rust

#[path = "shared/shared.rs"]
mod util;
use util::*;
/// A token as the assertions in this file see it: the numeric token id followed by the
/// lexeme text.
#[derive(Debug)]
struct Lexeme(u8, &'static str);

/// Equality is deliberately looser than a field-by-field comparison: the token ids must
/// always match, but the lexeme text only takes part for the ids listed in `eq`. This lets
/// expected values for other tokens be written with an empty string.
impl PartialEq for Lexeme {
    fn eq(&self, other: &Self) -> bool {
        let Lexeme(lhs_id, lhs_text) = self;
        let Lexeme(rhs_id, rhs_text) = other;

        // Different token ids never compare equal, whatever the text is.
        if lhs_id != rhs_id {
            return false;
        }

        // Identifiers and numbers compare both token and lexeme.
        // NOTE(review): `main` in this file writes identifiers as `Lexeme(33, ..)`, numbers as
        // `Lexeme(34, ..)` and `/` as `Lexeme(31, ..)` in its last check, so the literals 30
        // and 31 may predate a renumbering — confirm against `TOKEN_IDENT` / `TOKEN_NUMBER`.
        !matches!(*lhs_id, 30 | 31) || lhs_text == rhs_text
    }
}

impl Eq for Lexeme {}
/// Converts a tokeniser result into the [`Lexeme`] form the assertions compare against.
trait AsLexeme {
    /// Yields the token, or `None` when the value does not carry one.
    fn as_lexeme(self) -> Option<Lexeme>;
}

/// Value returned by `find_lexeme`. `#[repr(C)]` gives it C-compatible field order and
/// layout so it can be returned across the FFI boundary.
#[repr(C)]
struct LexemeRaw {
    /// Token id; `0` is treated as "no token" by [`AsLexeme::as_lexeme`].
    token: u8,
    /// Pointer to the first byte of the lexeme text.
    lexeme: *const u8,
    /// Length of the lexeme text in bytes.
    len: usize,
}

impl AsLexeme for LexemeRaw {
    /// Returns `None` for token id `0` and `Some(Lexeme(token, text))` for every other id.
    ///
    /// The token id is inspected *before* the pointer is touched: nothing is known about
    /// `lexeme`/`len` when there is no token, and building a slice from a null or otherwise
    /// invalid pointer is undefined behaviour even if the slice is never read.
    fn as_lexeme(self) -> Option<Lexeme> {
        let Self { token, lexeme, len } = self;

        if token == 0 {
            return None;
        }

        let text = if lexeme.is_null() || len == 0 {
            // `slice::from_raw_parts` requires a non-null pointer even for an empty slice,
            // so an absent or empty lexeme is represented by the empty literal instead.
            ""
        } else {
            // SAFETY: relies on the tokeniser returning, for every real token, `len` bytes
            // at `lexeme` that remain valid and unmodified while the `Lexeme` is in use and
            // that form valid UTF-8. None of this is verified here; the bytes are
            // intentionally not read at this point.
            // NOTE(review): the `'static` lifetime is a claim made by this test, not
            // something the tokeniser's interface guarantees — confirm the buffer outlives
            // every `Lexeme` built from it.
            unsafe { core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len)) }
        };

        Some(Lexeme(token, text))
    }
}
// Symbols provided by the tokeniser under test. Not all of them are referenced in the
// visible part of this file, hence the blanket `dead_code` allowance.
// NOTE(review): the descriptions below are derived from how `main` and `collect_tokens`
// use these symbols; confirm details against the tokeniser's own source.
#[allow(dead_code)]
unsafe extern "C" {
    /// Initialises the tokeniser from the file named by the NUL-terminated string `path`.
    /// The parameter is `c_char` because that is what `CStr::as_ptr` yields on every target.
    unsafe fn tokeniser_init(path: *const core::ffi::c_char);
    /// Initialises the tokeniser from `len` bytes of in-memory source starting at `bytes`.
    unsafe fn tokeniser_init_buf(bytes: *const u8, len: usize);
    /// Not called from the visible part of this file.
    unsafe fn tokeniser_print();
    /// Not called from the visible part of this file.
    unsafe fn is_ident(len: usize) -> bool;
    /// Not called from the visible part of this file.
    unsafe fn is_number(len: usize) -> bool;
    /// Not called from the visible part of this file.
    unsafe fn skip_whitespace();
    /// Returns the next token; `collect_tokens` stops at the first result whose `token` is 0.
    unsafe fn find_lexeme() -> LexemeRaw;
    /// Returns an optional slice for `token`; `main` expects "nothing" when the upcoming
    /// token differs and the token's text when it matches.
    unsafe fn expect_token(token: u8) -> MaybeFFISlice;
    /// Like `expect_token` but returns the slice directly.
    /// NOTE(review): behaviour on a mismatch is not visible here.
    unsafe fn unwrap_token(token: u8) -> FFISlice;
    // NOTE(review): the four statics below are not read in the visible part of this file;
    // the declared types are taken as-is and have not been checked against the definitions.
    static mut LEXEMES: *const u8;
    static mut LEXEME_LENS: usize;
    static mut NUM_LEXEMES: usize;
    static mut TOKENS: u8;
    // Tokeniser state; `main` asserts these read as 0 / null before the first init call.
    static mut input_file: u32;
    static mut buffer: *mut u8;
    static mut cursor: usize;
    static mut buffer_len: usize;
    /// Never returns. Not called from the visible part of this file.
    unsafe fn exit(code: i32) -> !;
}
/// Drains the tokeniser: calls `find_lexeme` repeatedly and gathers every token it yields,
/// stopping at the first result that `as_lexeme` maps to `None`.
fn collect_tokens() -> Vec<Lexeme> {
    std::iter::from_fn(|| {
        // SAFETY: `find_lexeme` is a foreign function whose contract is not visible here.
        // NOTE(review): presumably it requires a prior `tokeniser_init*` call — confirm.
        let raw = unsafe { find_lexeme() };
        raw.as_lexeme()
    })
    .collect()
}
fn main() {
unsafe {
use util::defs::*;
// assert initial state
assert_eq!((&raw const input_file).read(), 0);
assert_eq!((&raw const buffer_len).read(), 0);
assert_eq!((&raw const cursor).read(), 0);
assert_eq!((&raw const buffer).read(), core::ptr::null_mut());
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(1, ""),
Lexeme(2, ""),
Lexeme(3, ""),
Lexeme(4, ""),
Lexeme(8, ""),
Lexeme(13, ""),
Lexeme(11, ""),
Lexeme(10, ""),
Lexeme(9, ""),
Lexeme(5, ""),
][..]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(28, ""),
Lexeme(29, ""),
Lexeme(21, ""),
Lexeme(20, ""),
Lexeme(24, ""),
Lexeme(12, ""),
Lexeme(23, ""),
Lexeme(22, ""),
Lexeme(15, ""),
][..]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_IDENT, "this-is-an-ident"),
Lexeme(TOKEN_IDENT, "another_ident123"),
Lexeme(TOKEN_IDENT, "_underscore_test"),
Lexeme(TOKEN_IDENT, "mixedCASEIdent"),
Lexeme(TOKEN_IDENT, "number12345"),
Lexeme(TOKEN_IDENT, "____"),
Lexeme(TOKEN_IDENT, "_"),
Lexeme(17, ""),
Lexeme(TOKEN_IDENT, "leading-minus"),
Lexeme(TOKEN_IDENT, "trailing-minus-"),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(TOKEN_IDENT, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(expect_token(2).into_option(), None);
assert_eq!(expect_token(4).into_option().unwrap().as_str(), "fn");
assert_eq!(unwrap_token(TOKEN_IDENT).as_str(), "my-function");
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/comment.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_COMMENT, ""),
Lexeme(4, ""),
Lexeme(TOKEN_IDENT, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(TOKEN_COMMENT, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/number.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_NUMBER, "1234"),
Lexeme(TOKEN_NUMBER, "123_345_"),
Lexeme(TOKEN_NUMBER, "1234____56"),
Lexeme(TOKEN_NUMBER, "1"),
Lexeme(TOKEN_NUMBER, "0"),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/strings.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_STRING, "\"this is a string\""),
Lexeme(
TOKEN_STRING,
"\"another\nstring\nspanning multiple\n lines\""
),
Lexeme(TOKEN_STRING, "\"string with a \\\"quoted\\\" word\""),
Lexeme(TOKEN_STRING, "\"a\""),
Lexeme(TOKEN_STRING, "\"\"")
],
);
eprint!("Initializing tokeniser.. ");
let src = b"3 + 4";
tokeniser_init_buf(src.as_ptr(), src.len());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_NUMBER, "3"),
Lexeme(16, "+"),
Lexeme(TOKEN_NUMBER, "4")
],
);
eprint!("Initializing tokeniser.. ");
let src = b"fn main() -> void { return 1 + 2; }";
tokeniser_init_buf(src.as_ptr(), src.len());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, "fn"),
Lexeme(TOKEN_IDENT, "main"),
Lexeme(19, "("),
Lexeme(18, ")"),
Lexeme(12, "->"),
Lexeme(30, "void"),
Lexeme(21, "{"),
Lexeme(5, "return"),
Lexeme(TOKEN_NUMBER, "1"),
Lexeme(16, "+"),
Lexeme(TOKEN_NUMBER, "2"),
Lexeme(23, ";"),
Lexeme(20, "}"),
],
);
eprint!("Initializing tokeniser.. ");
let src = b"(b / d + c) * 42;";
tokeniser_init_buf(src.as_ptr(), src.len());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(19, "("),
Lexeme(33, "b"),
Lexeme(31, "/"),
Lexeme(33, "d"),
Lexeme(16, "+"),
Lexeme(33, "c"),
Lexeme(18, ")"),
Lexeme(32, "*"),
Lexeme(34, "42"),
Lexeme(23, ";")
],
);
eprintln!("Finished tokenising.");
}
}