more tests for tokeniser

2025-10-28 13:12:27 +01:00 · 2025-10-28 13:12:27 +01:00 · cfc1c8e3b3
parent 63fbca378b
commit cfc1c8e3b3
7 changed files with 171 additions and 129 deletions
--- a/lang/src/lib.asm
+++ b/lang/src/lib.asm
@@ -372,6 +372,8 @@ is_id_continue:
    jne .is_id_continue_ret
    cmp cl, '_'
    je .is_id_continue_ret
+    cmp cl, '-'
+    je .is_id_continue_ret
    xor rax, rax
    ret
 .is_id_continue_ret:
--- a/lang/src/tokeniser.asm
+++ b/lang/src/tokeniser.asm
@@ -114,98 +114,106 @@ global LEXEME_LENS
 global NUM_LEXEMES

 align 8
-LEXEMES: dq \
-    LEX_NOT_A_LEXEME, \
-    LEX_LET, \
-    LEX_IF, \
-    LEX_ELSE, \
-    LEX_FN, \
-    LEX_RETURN, \
-    LEX_LOOP, \
-    LEX_BREAK, \
-    LEX_CONTINUE, \
-    LEX_TRUE, \
-    LEX_FALSE, \
-    LEX_BOOL, \
-    LEX_ARROW, \
-    LEX_I32, \
-    LEX_U32, \
-    LEX_EQUALS, \
-    LEX_PLUS, \
-    LEX_MINUS, \
-    LEX_RPARENS, \
-    LEX_LPARENS, \
-    LEX_RBRACE, \
-    LEX_LBRACE, \
-    LEX_COLON, \
-    LEX_SEMI, \
-    LEX_COMMA, \
-    LEX_PIPE, \
-    LEX_AMP, \
-    LEX_EQEQ
-align 8
-TOKENS: db \
-    TOKEN_EOF, \
-    TOKEN_LET, \
-    TOKEN_IF, \
-    TOKEN_ELSE, \
-    TOKEN_FN, \
-    TOKEN_RETURN, \
-    TOKEN_LOOP, \
-    TOKEN_BREAK, \
-    TOKEN_CONTINUE, \
-    TOKEN_TRUE, \
-    TOKEN_FALSE, \
-    TOKEN_BOOL, \
-    TOKEN_ARROW, \
-    TOKEN_I32, \
-    TOKEN_U32, \
-    TOKEN_EQUALS, \
-    TOKEN_PLUS, \
-    TOKEN_MINUS, \
-    TOKEN_RPARENS, \
-    TOKEN_LPARENS, \
-    TOKEN_RBRACE, \
-    TOKEN_LBRACE, \
-    TOKEN_COLON, \
-    TOKEN_SEMI, \
-    TOKEN_COMMA, \
-    TOKEN_PIPE, \
-    TOKEN_AMP, \
-    TOKEN_EQEQ
-align 8
-LEXEME_LENS: dq \
-    0, \
-    LEX_LET_len, \
-    LEX_IF_len, \
-    LEX_ELSE_len, \
-    LEX_FN_len, \
-    LEX_RETURN_len, \
-    LEX_LOOP_len, \
-    LEX_BREAK_len, \
-    LEX_CONTINUE_len, \
-    LEX_TRUE_len, \
-    LEX_FALSE_len, \
-    LEX_BOOL_len, \
-    LEX_ARROW_len, \
-    LEX_I32_len, \
-    LEX_U32_len, \
-    LEX_EQUALS_len, \
-    LEX_PLUS_len, \
-    LEX_MINUS_len, \
-    LEX_RPARENS_len, \
-    LEX_LPARENS_len, \
-    LEX_RBRACE_len, \
-    LEX_LBRACE_len, \
-    LEX_COLON_len, \
-    LEX_SEMI_len, \
-    LEX_COMMA_len, \
-    LEX_PIPE_len, \
-    LEX_AMP_len, \
-    LEX_EQEQ_len
+LEXEMES:
+    dq LEX_NOT_A_LEXEME
+    dq LEX_LET
+    dq LEX_IF
+    dq LEX_ELSE
+    dq LEX_FN
+    dq LEX_RETURN
+    dq LEX_LOOP
+    dq LEX_BREAK
+    dq LEX_CONTINUE
+    dq LEX_TRUE
+    dq LEX_FALSE
+    dq LEX_BOOL
+    dq LEX_ARROW
+    dq LEX_I32
+    dq LEX_U32
+    dq LEX_EQUALS
+    dq LEX_PLUS
+    dq LEX_MINUS
+    dq LEX_RPARENS
+    dq LEX_LPARENS
+    dq LEX_RBRACE
+    dq LEX_LBRACE
+    dq LEX_COLON
+    dq LEX_SEMI
+    dq LEX_COMMA
+    dq LEX_PIPE
+    dq LEX_AMP
+    dq LEX_EQEQ
+    dq LEX_LBRACKET
+    dq LEX_RBRACKET

 align 8
-NUM_LEXEMES: dq 28
+TOKENS:
+    db TOKEN_EOF                   ;; 0
+    db TOKEN_LET                   ;; 1
+    db TOKEN_IF                    ;; 2
+    db TOKEN_ELSE                  ;; 3
+    db TOKEN_FN                    ;; 4
+    db TOKEN_RETURN                ;; 5
+    db TOKEN_LOOP                  ;; 6
+    db TOKEN_BREAK                 ;; 7
+    db TOKEN_CONTINUE              ;; 8
+    db TOKEN_TRUE                  ;; 9
+    db TOKEN_FALSE                 ;; 10
+    db TOKEN_BOOL                  ;; 11
+    db TOKEN_ARROW                 ;; 12
+    db TOKEN_I32                   ;; 13
+    db TOKEN_U32                   ;; 14
+    db TOKEN_EQUALS                ;; 15
+    db TOKEN_PLUS                  ;; 16
+    db TOKEN_MINUS                 ;; 17
+    db TOKEN_RPARENS               ;; 18
+    db TOKEN_LPARENS               ;; 19
+    db TOKEN_RBRACE                ;; 20
+    db TOKEN_LBRACE                ;; 21
+    db TOKEN_COLON                 ;; 22
+    db TOKEN_SEMI                  ;; 23
+    db TOKEN_COMMA                 ;; 24
+    db TOKEN_PIPE                  ;; 25
+    db TOKEN_AMP                   ;; 26
+    db TOKEN_EQEQ                  ;; 27
+    db TOKEN_LBRACKET              ;; 28
+    db TOKEN_RBRACKET              ;; 29
+
+align 8
+LEXEME_LENS:
+    dq 0
+    dq LEX_LET_len
+    dq LEX_IF_len
+    dq LEX_ELSE_len
+    dq LEX_FN_len
+    dq LEX_RETURN_len
+    dq LEX_LOOP_len
+    dq LEX_BREAK_len
+    dq LEX_CONTINUE_len
+    dq LEX_TRUE_len
+    dq LEX_FALSE_len
+    dq LEX_BOOL_len
+    dq LEX_ARROW_len
+    dq LEX_I32_len
+    dq LEX_U32_len
+    dq LEX_EQUALS_len
+    dq LEX_PLUS_len
+    dq LEX_MINUS_len
+    dq LEX_RPARENS_len
+    dq LEX_LPARENS_len
+    dq LEX_RBRACE_len
+    dq LEX_LBRACE_len
+    dq LEX_COLON_len
+    dq LEX_SEMI_len
+    dq LEX_COMMA_len
+    dq LEX_PIPE_len
+    dq LEX_AMP_len
+    dq LEX_EQEQ_len
+    dq LEX_LBRACKET_len
+    dq LEX_RBRACKET_len
+
+align 8
+NUM_LEXEMES: dq 30

    LEX_NOT_A_LEXEME db "<not a lexeme>", 0
    TOKEN_EOF       equ 0
@@ -290,12 +298,19 @@ NUM_LEXEMES: dq 28
    TOKEN_EQEQ     equ 27
    LEX_EQEQ db "=="
    LEX_EQEQ_len equ $ - LEX_EQEQ
-    TOKEN_IDENT     equ 28
+    TOKEN_LBRACKET  equ 28
+    LEX_LBRACKET db "["
+    LEX_LBRACKET_len equ $ - LEX_LBRACKET
+    TOKEN_RBRACKET  equ 29
+    LEX_RBRACKET db "]"
+    LEX_RBRACKET_len equ $ - LEX_RBRACKET
+    TOKEN_IDENT     equ 30
    LEX_IDENT db "<identifier>"
    LEX_IDENT_len equ $ - LEX_IDENT
-    TOKEN_NUMBER    equ 29
+    TOKEN_NUMBER    equ 31
    LEX_NUMBER db "<number>"
    LEX_NUMBER_len equ $ - LEX_NUMBER
+    

 section .text
 ;; rdi: length of matched lexeme
--- a/lang/tests/tokens.rs
+++ b/lang/tests/tokens.rs
@@ -3,6 +3,7 @@ extern "C" fn panic() -> ! {
    panic!("Called panic from external code.");
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 struct Lexeme(u8);

 impl Lexeme {
@@ -27,7 +28,7 @@ trait AsLexeme {
 impl AsLexeme for u8 {
    fn as_lexeme(self) -> Option<Lexeme> {
        match self {
-            1..=10 => Some(Lexeme(self)),
+            1.. => Some(Lexeme(self)),
            _ => None,
        }
    }
@@ -55,51 +56,61 @@ unsafe extern "C" {
    unsafe fn exit(code: i32) -> !;
 }

-// fn lexemes_raw() -> &'static [*const u8] {
-//     unsafe {
-//         core::slice::from_raw_parts(
-//             (&raw const LEXEMES),
-//             (&raw const NUM_LEXEMES).read(),
-//         )
-//     }
-// }
+fn collect_tokens() -> Vec<Lexeme> {
+    let mut lexemes = Vec::new();
+    unsafe {
+        while let Some(lexeme) = find_lexeme().as_lexeme() {
+            lexemes.push(lexeme);
+        }
+    }

-// fn lexeme_lens() -> &'static [usize] {
-//     unsafe {
-//         core::slice::from_raw_parts(
-//             (&raw const LEXEME_LENS),
-//             (&raw const NUM_LEXEMES).read(),
-//         )
-//     }
-// }
-
-// fn lexeme_iter() -> impl Iterator<Item = &'static str> {
-//     lexemes_raw().iter().zip(lexeme_lens().iter()).map(|(&ptr, &len)| {
-//         // SAFETY: lexemes_raw and lexeme_lens are guaranteed to contain valid
-//         // UTF-8 data and correct lengths.
-//         unsafe {
-//             core::str::from_utf8_unchecked(core::slice::from_raw_parts(ptr, len))
-//         }
-//     })
-// }
+    lexemes
+}

 fn main() {
-    let path = c"tests/tokens.l";
-
    unsafe {
+        // assert initial state
        assert_eq!((&raw const input_file).read(), 0);
        assert_eq!((&raw const buffer_len).read(), 0);
        assert_eq!((&raw const cursor).read(), 0);
        assert_eq!((&raw const buffer).read(), core::ptr::null_mut());
-        eprint!("Initializing tokeniser.. ");
-        tokeniser_init(path.as_ptr());
-        eprintln!("ok.");
-        eprintln!("{}: {:?}[{}..{}]", (&raw const input_file).read(), (&raw const buffer).read(), (&raw const cursor).read(), (&raw const buffer_len).read());
-        tokeniser_print();

-        while let Some(lexeme) = find_lexeme().as_lexeme() {
-            eprintln!("Found lexeme: {}", lexeme.lex());
-        }
+        eprint!("Initializing tokeniser.. ");
+        tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
+        eprintln!("ok.");
+
+        assert_eq!(&collect_tokens()[..], &[
+            Lexeme(4),
+            Lexeme(1),
+            Lexeme(2),
+            Lexeme(3),
+            Lexeme(4),
+            Lexeme(8),
+            Lexeme(13),
+            Lexeme(11),
+            Lexeme(10),
+            Lexeme(9),
+            Lexeme(5),
+        ][..]);
+
+        eprint!("Initializing tokeniser.. ");
+        tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
+        eprintln!("ok.");
+
+        assert_eq!(&collect_tokens()[..], &[
+            Lexeme(19),
+            Lexeme(18),
+            Lexeme(28),
+            Lexeme(29),
+            Lexeme(21),
+            Lexeme(20),
+            Lexeme(24),
+            Lexeme(12),
+            Lexeme(23),
+            Lexeme(22),
+            Lexeme(15),
+        ][..]);
+
        eprintln!("Finished tokenising.");
    }
 }
--- a/lang/tests/tokens/delimiters.l
+++ b/lang/tests/tokens/delimiters.l
@@ -0,0 +1 @@
+()[]{},->;:=
--- a/lang/tests/tokens/function.l
+++ b/lang/tests/tokens/function.l
@@ -0,0 +1,3 @@
+fn my-function() -> bool {
+  return false;
+}
--- a/lang/tests/tokens/identifier.l
+++ b/lang/tests/tokens/identifier.l
@@ -0,0 +1,7 @@
+this-is-an-ident
+another_ident123
+_underscore_test
+mixedCASEIdent
+number12345
+____
+_
--- a/lang/tests/tokens/keywords.l
+++ b/lang/tests/tokens/keywords.l
@@ -0,0 +1,3 @@
+fn let if else fn continue
+i32      bool false true
+        return