// from-scratch/lang/tests/tokens.rs

#[path = "shared/shared.rs"]
mod util;
use util::*;
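// A token as the tests see it: numeric token id plus lexeme text.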
#[derive(Debug)]
struct Lexeme(u8, &'static str);
impl PartialEq for Lexeme {
fn eq(&self, other: &Self) -> bool {
use util::defs::{TOKEN_IDENT, TOKEN_NUMBER};
match self.0 {
// Identifiers and numbers compare both token and lexeme
TOKEN_IDENT | TOKEN_NUMBER => self.0 == other.0 && self.1 == other.1,
_ => self.0 == other.0,
}
}
}
impl Eq for Lexeme {}
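// Conversion from the raw FFI result into the test-side Lexeme.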
trait AsLexeme {
fn as_lexeme(self) -> Option<Lexeme>;
}
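// Raw result of find_lexeme over the C ABI: token id plus pointer/length of the lexeme bytes.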
#[repr(C)]
struct LexemeRaw {
token: u8,
lexeme: *const u8,
len: usize,
}
impl AsLexeme for LexemeRaw {
fn as_lexeme(self) -> Option<Lexeme> {
let Self { token, lexeme, len } = self;
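// SAFETY: assumes the tokeniser hands back a valid pointer/length into its
// input buffer, that the bytes are valid UTF-8, and that the buffer outlives
// the assertions below.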
let slice =
unsafe { core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len)) };
match token {
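// Token 0 means no more lexemes; anything non-zero is a real lexeme.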
1.. => Some(Lexeme(token, slice)),
_ => None,
}
}
}
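// FFI surface of the tokeniser under test; symbols the tests do not call are
// kept but silenced via #[allow(dead_code)].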
#[allow(dead_code)]
unsafe extern "C" {
unsafe fn tokeniser_init(path: *const core::ffi::c_char) -> ();
unsafe fn tokeniser_init_buf(bytes: *const u8, len: usize) -> ();
unsafe fn tokeniser_print() -> ();
unsafe fn is_ident(len: usize) -> bool;
unsafe fn is_number(len: usize) -> bool;
unsafe fn skip_whitespace() -> ();
unsafe fn find_lexeme() -> LexemeRaw;
unsafe fn expect_token(token: u8) -> MaybeFFISlice;
unsafe fn unwrap_token(token: u8) -> FFISlice;
static mut LEXEMES: *const u8;
static mut LEXEME_LENS: usize;
static mut NUM_LEXEMES: usize;
static mut TOKENS: u8;
static mut input_file: u32;
static mut buffer: *mut u8;
static mut cursor: usize;
static mut buffer_len: usize;
unsafe fn exit(code: i32) -> !;
}
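// Drain the tokeniser: keep calling find_lexeme until it reports end of input.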
fn collect_tokens() -> Vec<Lexeme> {
let mut lexemes = Vec::new();
unsafe {
while let Some(lexeme) = find_lexeme().as_lexeme() {
lexemes.push(lexeme);
}
}
lexemes
}
fn main() {
unsafe {
use util::defs::*;
// assert initial state
assert_eq!((&raw const input_file).read(), 0);
assert_eq!((&raw const buffer_len).read(), 0);
assert_eq!((&raw const cursor).read(), 0);
assert_eq!((&raw const buffer).read(), core::ptr::null_mut());
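// keywords.l: keyword tokens are listed by raw numeric id; the "" lexemes are
// placeholders, since PartialEq ignores the text for non-ident/number tokens.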
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(1, ""),
Lexeme(2, ""),
Lexeme(3, ""),
Lexeme(4, ""),
Lexeme(8, ""),
Lexeme(13, ""),
Lexeme(11, ""),
Lexeme(10, ""),
Lexeme(9, ""),
Lexeme(5, ""),
][..]
);
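// delimiters.l: one token per delimiter, in source order.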
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_LPARENS, ""),
Lexeme(TOKEN_RPARENS, ""),
Lexeme(TOKEN_LBRACKET, ""),
Lexeme(TOKEN_RBRACKET, ""),
Lexeme(TOKEN_LBRACE, ""),
Lexeme(TOKEN_RBRACE, ""),
Lexeme(TOKEN_COMMA, ""),
Lexeme(TOKEN_ARROW, ""),
Lexeme(TOKEN_SEMI, ""),
Lexeme(TOKEN_COLON, ""),
Lexeme(TOKEN_EQUALS, ""),
][..]
);
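// identifier.l: identifiers keep their text; a leading '-' is lexed as
// TOKEN_MINUS rather than as part of the following identifier.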
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_IDENT, "this-is-an-ident"),
Lexeme(TOKEN_IDENT, "another_ident123"),
Lexeme(TOKEN_IDENT, "_underscore_test"),
Lexeme(TOKEN_IDENT, "mixedCASEIdent"),
Lexeme(TOKEN_IDENT, "number12345"),
Lexeme(TOKEN_IDENT, "____"),
Lexeme(TOKEN_IDENT, "_"),
Lexeme(TOKEN_MINUS, ""),
Lexeme(TOKEN_IDENT, "leading-minus"),
Lexeme(TOKEN_IDENT, "trailing-minus-"),
]
);
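// function.l: a small function definition mixing keywords, delimiters and an identifier.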
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_FN, ""),
Lexeme(TOKEN_IDENT, "my-function"),
Lexeme(TOKEN_LPARENS, ""),
Lexeme(TOKEN_RPARENS, ""),
Lexeme(TOKEN_ARROW, ""),
Lexeme(TOKEN_BOOL, ""),
Lexeme(TOKEN_LBRACE, ""),
Lexeme(TOKEN_RETURN, ""),
Lexeme(TOKEN_FALSE, ""),
Lexeme(TOKEN_SEMI, ""),
Lexeme(TOKEN_RBRACE, ""),
]
);
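// Re-tokenise function.l to exercise the single-token helpers: expect_token is
// fallible (None on mismatch), unwrap_token returns the slice directly.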
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(expect_token(TOKEN_IF).into_option(), None);
assert_eq!(expect_token(TOKEN_FN).into_option().unwrap().as_str(), "fn");
assert_eq!(unwrap_token(TOKEN_IDENT).as_str(), "my-function");
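// comment.l: comments are emitted as TOKEN_COMMENT tokens rather than skipped.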
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/comment.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_COMMENT, ""),
Lexeme(TOKEN_FN, ""),
Lexeme(TOKEN_IDENT, "my-function"),
Lexeme(TOKEN_LPARENS, ""),
Lexeme(TOKEN_RPARENS, ""),
Lexeme(TOKEN_ARROW, ""),
Lexeme(TOKEN_BOOL, ""),
Lexeme(TOKEN_LBRACE, ""),
Lexeme(TOKEN_COMMENT, ""),
Lexeme(TOKEN_RETURN, ""),
Lexeme(TOKEN_FALSE, ""),
Lexeme(TOKEN_SEMI, ""),
Lexeme(TOKEN_RBRACE, ""),
]
);
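// number.l: underscores are accepted inside and at the end of number literals.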
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/number.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_NUMBER, "1234"),
Lexeme(TOKEN_NUMBER, "123_345_"),
Lexeme(TOKEN_NUMBER, "1234____56"),
Lexeme(TOKEN_NUMBER, "1"),
Lexeme(TOKEN_NUMBER, "0"),
]
);
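// strings.l: string lexemes keep their surrounding quotes, may span multiple
// lines and may contain escaped quotes. The text shown here is documentation
// only: PartialEq compares lexeme text just for identifiers and numbers.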
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/strings.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_STRING, "\"this is a string\""),
Lexeme(
TOKEN_STRING,
"\"another\nstring\nspanning multiple\n lines\""
),
Lexeme(TOKEN_STRING, "\"string with a \\\"quoted\\\" word\""),
Lexeme(TOKEN_STRING, "\"a\""),
Lexeme(TOKEN_STRING, "\"\"")
],
);
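// The remaining cases tokenise directly from an in-memory buffer instead of a file.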
eprint!("Initializing tokeniser.. ");
let src = b"3 + 4";
tokeniser_init_buf(src.as_ptr(), src.len());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_NUMBER, "3"),
Lexeme(TOKEN_PLUS, "+"),
Lexeme(TOKEN_NUMBER, "4")
],
);
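// A complete function definition lexed straight from a buffer.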
eprint!("Initializing tokeniser.. ");
let src = b"fn main() -> void { return 1 + 2; }";
tokeniser_init_buf(src.as_ptr(), src.len());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_FN, "fn"),
Lexeme(TOKEN_IDENT, "main"),
Lexeme(TOKEN_LPARENS, "("),
Lexeme(TOKEN_RPARENS, ")"),
Lexeme(TOKEN_ARROW, "->"),
Lexeme(TOKEN_VOID, "void"),
Lexeme(TOKEN_LBRACE, "{"),
Lexeme(TOKEN_RETURN, "return"),
Lexeme(TOKEN_NUMBER, "1"),
Lexeme(TOKEN_PLUS, "+"),
Lexeme(TOKEN_NUMBER, "2"),
Lexeme(TOKEN_SEMI, ";"),
Lexeme(TOKEN_RBRACE, "}"),
],
);
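// An arithmetic expression with parentheses and binary operators.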
eprint!("Initializing tokeniser.. ");
let src = b"(b / d + c) * 42;";
tokeniser_init_buf(src.as_ptr(), src.len());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_LPARENS, "("),
Lexeme(TOKEN_IDENT, "b"),
Lexeme(TOKEN_SLASH, "/"),
Lexeme(TOKEN_IDENT, "d"),
Lexeme(TOKEN_PLUS, "+"),
Lexeme(TOKEN_IDENT, "c"),
Lexeme(TOKEN_RPARENS, ")"),
Lexeme(TOKEN_STAR, "*"),
Lexeme(TOKEN_NUMBER, "42"),
Lexeme(TOKEN_SEMI, ";")
],
);
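// Operator disambiguation: the tokeniser prefers the longest operator match,
// so "<<<=<" lexes as "<<", "<=", "<" and "&&|&" as "&&", "|", "&".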
eprint!("Initializing tokeniser.. ");
let src = b"<<<=<a == b = c ||| &||&&|&";
tokeniser_init_buf(src.as_ptr(), src.len());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(TOKEN_LESSLESS, ""),
Lexeme(TOKEN_LEQ, ""),
Lexeme(TOKEN_LT, ""),
Lexeme(TOKEN_IDENT, "a"),
Lexeme(TOKEN_EQEQ, ""),
Lexeme(TOKEN_IDENT, "b"),
Lexeme(TOKEN_EQUALS, ""),
Lexeme(TOKEN_IDENT, "c"),
Lexeme(TOKEN_PIPE2, ""),
Lexeme(TOKEN_PIPE, ""),
Lexeme(TOKEN_AMP, ""),
Lexeme(TOKEN_PIPE2, ""),
Lexeme(TOKEN_AMP2, ""),
Lexeme(TOKEN_PIPE, ""),
Lexeme(TOKEN_AMP, ""),
],
);
eprintln!("Finished tokenising.");
}
}