// from-scratch/lang/tests/tokens.rs
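
// Integration test for the externally linked tokeniser declared in the
// `unsafe extern "C"` block below: each run re-initialises the tokeniser with
// a fixture file from tests/tokens/ and compares the stream produced by
// `find_lexeme` against the expected lexemes.

// Exported under the unmangled name `panic` so that the external code can
// abort through Rust's panic machinery.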
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
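
// A token id paired with the matched text. Only identifiers and numbers
// (token ids 30 and 31) compare by their text; every other token compares by
// id alone, so the expected values below can leave the text empty.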
#[derive(Debug)]
struct Lexeme(u8, &'static str);
impl PartialEq for Lexeme {
    fn eq(&self, other: &Self) -> bool {
        match self.0 {
            // Identifiers and numbers compare both token and lexeme
            30 | 31 => self.0 == other.0 && self.1 == other.1,
            _ => self.0 == other.0,
        }
    }
}
impl Eq for Lexeme {}
impl Lexeme {
    fn lex(&self) -> &'static str {
        self.1
    }
}
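
// Conversion from the raw record returned by `find_lexeme`: the token id, a
// pointer to the matched text and its length. A token id of 0 ends the stream
// and maps to `None`.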
trait AsLexeme {
    fn as_lexeme(self) -> Option<Lexeme>;
}
#[repr(C)]
struct LexemeRaw {
    token: u8,
    lexeme: *const u8,
    len: usize,
}
impl AsLexeme for LexemeRaw {
    fn as_lexeme(self) -> Option<Lexeme> {
        let Self { token, lexeme, len } = self;
        let slice = unsafe {
            core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len))
        };
        match token {
            1.. => Some(Lexeme(token, slice)),
            _ => None,
        }
    }
}
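
// Symbols provided by the tokeniser under test, plus its internal state; the
// statics let main assert the pre-initialisation defaults before the first run.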
#[allow(dead_code)]
unsafe extern "C" {
    unsafe fn tokeniser_init(path: *const i8) -> ();
    unsafe fn tokeniser_print() -> ();
    unsafe fn is_ident(len: usize) -> bool;
    unsafe fn is_number(len: usize) -> bool;
    unsafe fn skip_whitespace() -> ();
    unsafe fn find_lexeme() -> LexemeRaw;
    static mut LEXEMES: *const u8;
    static mut LEXEME_LENS: usize;
    static mut NUM_LEXEMES: usize;
    static mut TOKENS: u8;
    static mut input_file: u32;
    static mut buffer: *mut u8;
    static mut cursor: usize;
    static mut buffer_len: usize;
    unsafe fn exit(code: i32) -> !;
}
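
// Drains the tokeniser by calling `find_lexeme` until it reports a token id
// of 0 (converted to `None` by `as_lexeme`).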
fn collect_tokens() -> Vec<Lexeme> {
    let mut lexemes = Vec::new();
    unsafe {
        while let Some(lexeme) = find_lexeme().as_lexeme() {
            lexemes.push(lexeme);
        }
    }
    lexemes
}
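
// Each fixture starts from a fresh `tokeniser_init` call; the very first one
// also checks that the tokeniser's statics start out zeroed / null.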
fn main() {
    unsafe {
        // assert initial state
        assert_eq!((&raw const input_file).read(), 0);
        assert_eq!((&raw const buffer_len).read(), 0);
        assert_eq!((&raw const cursor).read(), 0);
        assert_eq!((&raw const buffer).read(), core::ptr::null_mut());
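
        // Keyword fixture: lexeme text is irrelevant here, only the token ids
        // are compared (see the PartialEq impl on Lexeme).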
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[
Lexeme(4, ""),
Lexeme(1, ""),
Lexeme(2, ""),
Lexeme(3, ""),
Lexeme(4, ""),
Lexeme(8, ""),
Lexeme(13, ""),
Lexeme(11, ""),
Lexeme(10, ""),
Lexeme(9, ""),
Lexeme(5, ""),
][..]);
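
        // Delimiter fixture: again compared by token id only.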
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(28, ""),
Lexeme(29, ""),
Lexeme(21, ""),
Lexeme(20, ""),
Lexeme(24, ""),
Lexeme(12, ""),
Lexeme(23, ""),
Lexeme(22, ""),
Lexeme(15, ""),
][..]);
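
        // Identifier fixture: identifiers (token 30) compare by their text.
        // Note the separate token (17) emitted before "leading-minus", while
        // the trailing '-' stays part of "trailing-minus-".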
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[
Lexeme(30, "this-is-an-ident"),
Lexeme(30, "another_ident123"),
Lexeme(30, "_underscore_test"),
Lexeme(30, "mixedCASEIdent"),
Lexeme(30, "number12345"),
Lexeme(30, "____"),
Lexeme(30, "_"),
Lexeme(17, ""), Lexeme(30, "leading-minus"),
Lexeme(30, "trailing-minus-"),
]);
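
        // Function fixture: a small function-shaped program mixing keywords,
        // an identifier and delimiters.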
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]);
eprintln!("Finished tokenising.");
}
}