more tests for tokeniser

2025-10-28 13:12:27 +01:00 · 2025-10-28 13:12:27 +01:00 · cfc1c8e3b3
parent 63fbca378b
commit cfc1c8e3b3
7 changed files with 171 additions and 129 deletions
--- a/lang/src/lib.asm
+++ b/lang/src/lib.asm
@ -372,6 +372,8 @@ is_id_continue:
    jne .is_id_continue_ret
    cmp cl, '_'
    je .is_id_continue_ret
    cmp cl, '-'
    je .is_id_continue_ret
    xor rax, rax
    ret
 .is_id_continue_ret:
--- a/lang/src/tokeniser.asm
+++ b/lang/src/tokeniser.asm
@ -114,98 +114,106 @@ global LEXEME_LENS
 global NUM_LEXEMES
 align 8
-LEXEMES: dq \
+LEXEMES:
-    LEX_NOT_A_LEXEME, \
+    dq LEX_NOT_A_LEXEME
-    LEX_LET, \
+    dq LEX_LET
-    LEX_IF, \
+    dq LEX_IF
-    LEX_ELSE, \
+    dq LEX_ELSE
-    LEX_FN, \
+    dq LEX_FN
-    LEX_RETURN, \
+    dq LEX_RETURN
-    LEX_LOOP, \
+    dq LEX_LOOP
-    LEX_BREAK, \
+    dq LEX_BREAK
-    LEX_CONTINUE, \
+    dq LEX_CONTINUE
-    LEX_TRUE, \
+    dq LEX_TRUE
-    LEX_FALSE, \
+    dq LEX_FALSE
-    LEX_BOOL, \
+    dq LEX_BOOL
-    LEX_ARROW, \
+    dq LEX_ARROW
-    LEX_I32, \
+    dq LEX_I32
-    LEX_U32, \
+    dq LEX_U32
-    LEX_EQUALS, \
+    dq LEX_EQUALS
-    LEX_PLUS, \
+    dq LEX_PLUS
-    LEX_MINUS, \
+    dq LEX_MINUS
-    LEX_RPARENS, \
+    dq LEX_RPARENS
-    LEX_LPARENS, \
+    dq LEX_LPARENS
-    LEX_RBRACE, \
+    dq LEX_RBRACE
-    LEX_LBRACE, \
+    dq LEX_LBRACE
-    LEX_COLON, \
+    dq LEX_COLON
-    LEX_SEMI, \
+    dq LEX_SEMI
-    LEX_COMMA, \
+    dq LEX_COMMA
-    LEX_PIPE, \
+    dq LEX_PIPE
-    LEX_AMP, \
+    dq LEX_AMP
-    LEX_EQEQ
+    dq LEX_EQEQ
-align 8
+    dq LEX_LBRACKET
-TOKENS: db \
+    dq LEX_RBRACKET
    TOKEN_EOF, \
    TOKEN_LET, \
    TOKEN_IF, \
    TOKEN_ELSE, \
    TOKEN_FN, \
    TOKEN_RETURN, \
    TOKEN_LOOP, \
    TOKEN_BREAK, \
    TOKEN_CONTINUE, \
    TOKEN_TRUE, \
    TOKEN_FALSE, \
    TOKEN_BOOL, \
    TOKEN_ARROW, \
    TOKEN_I32, \
    TOKEN_U32, \
    TOKEN_EQUALS, \
    TOKEN_PLUS, \
    TOKEN_MINUS, \
    TOKEN_RPARENS, \
    TOKEN_LPARENS, \
    TOKEN_RBRACE, \
    TOKEN_LBRACE, \
    TOKEN_COLON, \
    TOKEN_SEMI, \
    TOKEN_COMMA, \
    TOKEN_PIPE, \
    TOKEN_AMP, \
    TOKEN_EQEQ
 align 8
 LEXEME_LENS: dq \
    0, \
    LEX_LET_len, \
    LEX_IF_len, \
    LEX_ELSE_len, \
    LEX_FN_len, \
    LEX_RETURN_len, \
    LEX_LOOP_len, \
    LEX_BREAK_len, \
    LEX_CONTINUE_len, \
    LEX_TRUE_len, \
    LEX_FALSE_len, \
    LEX_BOOL_len, \
    LEX_ARROW_len, \
    LEX_I32_len, \
    LEX_U32_len, \
    LEX_EQUALS_len, \
    LEX_PLUS_len, \
    LEX_MINUS_len, \
    LEX_RPARENS_len, \
    LEX_LPARENS_len, \
    LEX_RBRACE_len, \
    LEX_LBRACE_len, \
    LEX_COLON_len, \
    LEX_SEMI_len, \
    LEX_COMMA_len, \
    LEX_PIPE_len, \
    LEX_AMP_len, \
    LEX_EQEQ_len
 align 8
-NUM_LEXEMES: dq 28
+TOKENS:
    ;; Token id table: one byte (db) per entry, 30 entries, matching the
    ;; NUM_LEXEMES value of 30 declared further down.
    ;; Entries are listed in the same order as the LEXEMES and LEXEME_LENS
    ;; tables. The trailing number on each line is the entry's index; it
    ;; matches the TOKEN_* value for the constants whose definitions are
    ;; visible (TOKEN_EOF = 0, TOKEN_EQEQ = 27, TOKEN_LBRACKET = 28,
    ;; TOKEN_RBRACKET = 29) -- presumably for all of them; verify when
    ;; adding a new token.
    db TOKEN_EOF                   ;; 0
    db TOKEN_LET                   ;; 1
    db TOKEN_IF                    ;; 2
    db TOKEN_ELSE                  ;; 3
    db TOKEN_FN                    ;; 4
    db TOKEN_RETURN                ;; 5
    db TOKEN_LOOP                  ;; 6
    db TOKEN_BREAK                 ;; 7
    db TOKEN_CONTINUE              ;; 8
    db TOKEN_TRUE                  ;; 9
    db TOKEN_FALSE                 ;; 10
    db TOKEN_BOOL                  ;; 11
    db TOKEN_ARROW                 ;; 12
    db TOKEN_I32                   ;; 13
    db TOKEN_U32                   ;; 14
    db TOKEN_EQUALS                ;; 15
    db TOKEN_PLUS                  ;; 16
    db TOKEN_MINUS                 ;; 17
    db TOKEN_RPARENS               ;; 18
    db TOKEN_LPARENS               ;; 19
    db TOKEN_RBRACE                ;; 20
    db TOKEN_LBRACE                ;; 21
    db TOKEN_COLON                 ;; 22
    db TOKEN_SEMI                  ;; 23
    db TOKEN_COMMA                 ;; 24
    db TOKEN_PIPE                  ;; 25
    db TOKEN_AMP                   ;; 26
    db TOKEN_EQEQ                  ;; 27
    db TOKEN_LBRACKET              ;; 28
    db TOKEN_RBRACKET              ;; 29
 align 8
 LEXEME_LENS:
    ;; Lexeme length table: one qword (dq) per entry, in the same order as
    ;; the LEXEMES and TOKENS tables.
    ;; The first entry is a literal 0 and lines up with the LEX_NOT_A_LEXEME
    ;; slot of LEXEMES; every other entry is a LEX_*_len constant (defined
    ;; next to its string as `equ $ - LEX_*`, see LEX_EQEQ_len).
    dq 0
    dq LEX_LET_len
    dq LEX_IF_len
    dq LEX_ELSE_len
    dq LEX_FN_len
    dq LEX_RETURN_len
    dq LEX_LOOP_len
    dq LEX_BREAK_len
    dq LEX_CONTINUE_len
    dq LEX_TRUE_len
    dq LEX_FALSE_len
    dq LEX_BOOL_len
    dq LEX_ARROW_len
    dq LEX_I32_len
    dq LEX_U32_len
    dq LEX_EQUALS_len
    dq LEX_PLUS_len
    dq LEX_MINUS_len
    dq LEX_RPARENS_len
    dq LEX_LPARENS_len
    dq LEX_RBRACE_len
    dq LEX_LBRACE_len
    dq LEX_COLON_len
    dq LEX_SEMI_len
    dq LEX_COMMA_len
    dq LEX_PIPE_len
    dq LEX_AMP_len
    dq LEX_EQEQ_len
    dq LEX_LBRACKET_len
    dq LEX_RBRACKET_len
 align 8
 NUM_LEXEMES: dq 30
    LEX_NOT_A_LEXEME db "<not a lexeme>", 0
    TOKEN_EOF       equ 0
@ -290,13 +298,20 @@ NUM_LEXEMES: dq 28
    TOKEN_EQEQ     equ 27
    LEX_EQEQ db "=="
    LEX_EQEQ_len equ $ - LEX_EQEQ
-    TOKEN_IDENT     equ 28
+    TOKEN_LBRACKET  equ 28
    LEX_LBRACKET db "["
    LEX_LBRACKET_len equ $ - LEX_LBRACKET
    TOKEN_RBRACKET  equ 29
    LEX_RBRACKET db "]"
    LEX_RBRACKET_len equ $ - LEX_RBRACKET
    TOKEN_IDENT     equ 30
    LEX_IDENT db "<identifier>"
    LEX_IDENT_len equ $ - LEX_IDENT
-    TOKEN_NUMBER    equ 29
+    TOKEN_NUMBER    equ 31
    LEX_NUMBER db "<number>"
    LEX_NUMBER_len equ $ - LEX_NUMBER
 section .text
 ;; rdi: length of matched lexeme
 is_ident:
--- a/lang/tests/tokens.rs
+++ b/lang/tests/tokens.rs
@ -3,6 +3,7 @@ extern "C" fn panic() -> ! {
    panic!("Called panic from external code.");
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 struct Lexeme(u8);
 impl Lexeme {
@ -27,7 +28,7 @@ trait AsLexeme {
 impl AsLexeme for u8 {
    fn as_lexeme(self) -> Option<Lexeme> {
        match self {
-            1..=10 => Some(Lexeme(self)),
+            1.. => Some(Lexeme(self)),
            _ => None,
        }
    }
@ -55,51 +56,61 @@ unsafe extern "C" {
    unsafe fn exit(code: i32) -> !;
 }
-// fn lexemes_raw() -> &'static [*const u8] {
+fn collect_tokens() -> Vec<Lexeme> {
    // Calls find_lexeme() repeatedly and pushes each result for as long as
    // as_lexeme() yields Some; the first None ends the loop. The collected
    // lexemes are returned in the order they were produced.
    // NOTE(review): the `-` lines interleaved below are the commented-out
    // helpers this commit removes; the unmarked `// }` line is unchanged
    // context, i.e. a leftover of that commented-out code.
-//     unsafe {
+    let mut lexemes = Vec::new();
-//         core::slice::from_raw_parts(
+    unsafe {
-//             (&raw const LEXEMES),
+        while let Some(lexeme) = find_lexeme().as_lexeme() {
-//             (&raw const NUM_LEXEMES).read(),
+            lexemes.push(lexeme);
-//         )
+        }
-//     }
+    }
 // }
-// fn lexeme_lens() -> &'static [usize] {
+    lexemes
-//     unsafe {
+}
 //         core::slice::from_raw_parts(
 //             (&raw const LEXEME_LENS),
 //             (&raw const NUM_LEXEMES).read(),
 //         )
 //     }
 // }
 // fn lexeme_iter() -> impl Iterator<Item = &'static str> {
 //     lexemes_raw().iter().zip(lexeme_lens().iter()).map(|(&ptr, &len)| {
 //         // SAFETY: lexemes_raw and lexeme_lens are guaranteed to contain valid
 //         // UTF-8 data and correct lengths.
 //         unsafe {
 //             core::str::from_utf8_unchecked(core::slice::from_raw_parts(ptr, len))
 //         }
 //     })
 // }
 // Test driver for the tokeniser: checks the globals before initialisation,
 // initialises the tokeniser on tests/tokens.l and prints its state, then
 // re-initialises it on the keywords and delimiters fixtures and asserts the
 // exact sequence of lexeme ids returned by collect_tokens() for each.
 fn main() {
    let path = c"tests/tokens.l";
    unsafe {
        // assert initial state
        // (input_file, buffer_len and cursor read as 0, buffer as null,
        // before tokeniser_init has been called)
        assert_eq!((&raw const input_file).read(), 0);
        assert_eq!((&raw const buffer_len).read(), 0);
        assert_eq!((&raw const cursor).read(), 0);
        assert_eq!((&raw const buffer).read(), core::ptr::null_mut());
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(path.as_ptr());
        eprintln!("ok.");
        // Dump the globals after init as `input_file: buffer[cursor..buffer_len]`.
        eprintln!("{}: {:?}[{}..{}]", (&raw const input_file).read(), (&raw const buffer).read(), (&raw const cursor).read(), (&raw const buffer_len).read());
        tokeniser_print();
        // The commit replaces the old print-every-lexeme loop (`-` lines) with
        // a re-initialisation on the keywords fixture (`+` lines).
-        while let Some(lexeme) = find_lexeme().as_lexeme() {
+        eprint!("Initializing tokeniser.. ");
-            eprintln!("Found lexeme: {}", lexeme.lex());
+        tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
-        }
+        eprintln!("ok.");
        // keywords.l contains: fn let if else fn continue i32 bool false true return
        // The expected ids below are the TOKENS table indices of those
        // keywords, in file order (fn = 4, let = 1, if = 2, else = 3,
        // continue = 8, i32 = 13, bool = 11, false = 10, true = 9, return = 5).
        assert_eq!(&collect_tokens()[..], &[
            Lexeme(4),
            Lexeme(1),
            Lexeme(2),
            Lexeme(3),
            Lexeme(4),
            Lexeme(8),
            Lexeme(13),
            Lexeme(11),
            Lexeme(10),
            Lexeme(9),
            Lexeme(5),
        ][..]);
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
        eprintln!("ok.");
        // delimiters.l contains: ()[]{},->;:=
        // Expected ids in file order: ( = 19, ) = 18, [ = 28, ] = 29, { = 21,
        // } = 20, `,` = 24, -> = 12, ; = 23, : = 22, = is 15.
        assert_eq!(&collect_tokens()[..], &[
            Lexeme(19),
            Lexeme(18),
            Lexeme(28),
            Lexeme(29),
            Lexeme(21),
            Lexeme(20),
            Lexeme(24),
            Lexeme(12),
            Lexeme(23),
            Lexeme(22),
            Lexeme(15),
        ][..]);
        eprintln!("Finished tokenising.");
    }
 }
--- a/lang/tests/tokens/delimiters.l
+++ b/lang/tests/tokens/delimiters.l
@ -0,0 +1 @@
 ()[]{},->;:=
--- a/lang/tests/tokens/function.l
+++ b/lang/tests/tokens/function.l
@ -0,0 +1,3 @@
 fn my-function() -> bool {
  return false;
 }
--- a/lang/tests/tokens/identifier.l
+++ b/lang/tests/tokens/identifier.l
@ -0,0 +1,7 @@
 this-is-an-ident
 another_ident123
 _underscore_test
 mixedCASEIdent
 number12345
 ____
 _
--- a/lang/tests/tokens/keywords.l
+++ b/lang/tests/tokens/keywords.l
@ -0,0 +1,3 @@
 fn let if else fn continue
 i32      bool false true
        return