can tokenise simple tokens

janis 2025-10-28 12:44:08 +01:00
parent 719451b935
commit 63fbca378b
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8
3 changed files with 53 additions and 18 deletions


@@ -393,7 +393,7 @@ is_id_start:
 ;; dil: byte to check
 is_whitespace:
-    cmp dil, ' '
+    cmp dil, 32 ; space
     je .is_ws
     cmp dil, 9 ; tab
     je .is_ws
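
The change is cosmetic: the character literal ' ' becomes the explicit byte value 32, annotated with a comment. As a Rust sketch, the predicate this routine implements looks roughly like the following; only the space and tab arms are visible in this hunk, so any further cases (newline, carriage return) are an assumption:

    // Sketch of the is_whitespace predicate. Space and tab are the cases
    // shown in the diff; anything beyond that is assumed.
    fn is_whitespace(byte: u8) -> bool {
        matches!(byte, 32 /* ' ' */ | 9 /* '\t' */)
    }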


@@ -112,7 +112,8 @@ global LEXEMES
 global TOKENS
 global LEXEME_LENS
 global NUM_LEXEMES
+align 8
 LEXEMES: dq \
     LEX_NOT_A_LEXEME, \
     LEX_LET, \
@@ -142,6 +143,7 @@ LEXEMES: dq \
     LEX_PIPE, \
     LEX_AMP, \
     LEX_EQEQ
+align 8
 TOKENS: db \
     TOKEN_EOF, \
     TOKEN_LET, \
@@ -171,6 +173,7 @@ TOKENS: db \
     TOKEN_PIPE, \
     TOKEN_AMP, \
     TOKEN_EQEQ
+align 8
 LEXEME_LENS: dq \
     0, \
     LEX_LET_len, \
@@ -201,7 +204,8 @@ LEXEME_LENS: dq \
     LEX_AMP_len, \
     LEX_EQEQ_len
-NUM_LEXEMES equ 28
+align 8
+NUM_LEXEMES: dq 28
 LEX_NOT_A_LEXEME db "<not a lexeme>", 0
 TOKEN_EOF equ 0
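
LEXEMES, TOKENS, and LEXEME_LENS are parallel tables: the same index selects a lexeme's text pointer, its token id, and its byte length. The new align 8 directives keep each table on an 8-byte boundary, and NUM_LEXEMES changes from an assemble-time constant (equ) to an 8-byte value in memory (dq 28), which turns it into a symbol the Rust side can actually read. A rough Rust picture of the layout, with a hypothetical struct mirroring the asm symbols:

    // Parallel-table layout, sketched. The three slices correspond to the
    // asm tables and share one index space.
    struct LexemeTables {
        lexemes: &'static [*const u8], // LEXEMES: dq ...     (text pointers)
        lexeme_lens: &'static [usize], // LEXEME_LENS: dq ... (byte lengths)
        tokens: &'static [u8],         // TOKENS: db ...      (token ids)
    }

    impl LexemeTables {
        // Everything known about lexeme id i comes from the same index.
        fn entry(&self, i: usize) -> (*const u8, usize, u8) {
            (self.lexemes[i], self.lexeme_lens[i], self.tokens[i])
        }
    }
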
@@ -353,24 +357,25 @@ skip_whitespaces:
     mov rbp, rsp
     push r12
     push r13
+    push r14
     ; let start = buffer.add(cursor);
     ; let end = buffer.add(buffer_len);
-    mov rax, [rel cursor]
-    mov r12, [rel buffer]
+    mov r12, [rel cursor]
     mov r13, [rel buffer_len]
-    add r13, r12 ; end
-    add r12, rax ; start
+    mov r14, [rel buffer]
     ; for ptr in start..end {
 .loop:
     cmp r12, r13
     jge .done
-    mov dil, [r12]
+    mov dil, [r14 + r12]
     call is_whitespace
     test rax, rax
     je .done
     inc r12
     jmp .loop
 .done:
     mov [rel cursor], r12
+    pop r14
     pop r13
     pop r12
     pop rbp
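
The rewrite fixes how the cursor leaves the loop. The old code built start and end pointers in r12 and r13 and then stored r12 back into cursor at .done, so cursor ended up holding buffer + offset instead of an offset. The new code keeps r12 as a plain index: r13 holds buffer_len as the limit, r14 holds the buffer base for the load, and the final mov [rel cursor], r12 writes a valid index. Roughly, in Rust, reusing the is_whitespace sketch above and assuming cursor and buffer_len are byte indices:

    // Index-based whitespace skip, mirroring the new code path.
    unsafe fn skip_whitespaces(buffer: *const u8, buffer_len: usize, cursor: &mut usize) {
        while *cursor < buffer_len && is_whitespace(buffer.add(*cursor).read()) {
            *cursor += 1; // inc r12
        }
    }
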
@@ -399,7 +404,7 @@ find_lexeme:
     mov r12, 1
     ; for 1..NUM_LEXEMES {
 .loop:
-    cmp r12, NUM_LEXEMES
+    cmp r12, [rel NUM_LEXEMES]
     jge .not_found
     ; let lexeme = LEXEMES[i];
     lea rdi, [rel LEXEMES]
@@ -415,8 +420,8 @@ find_lexeme:
     jo .not_found
     ; if lexeme.len() > buffer.len() - cursor {
     cmp rsi, rcx
-    jg .not_found
-    ; goto .not_found
+    jg .next
+    ; continue;
     ; }
     mov rcx, rsi
     ; if buffer[cursor..cursor+len] == lexeme {
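
Two fixes land in find_lexeme. First, now that NUM_LEXEMES is data rather than an equ constant, the loop bound has to be loaded from memory ([rel NUM_LEXEMES]); a bare NUM_LEXEMES operand would now compare against the label's address. Second, a candidate lexeme that is longer than the remaining input no longer aborts the whole search (jg .not_found) but skips to the next candidate (jg .next). The shape of the loop, sketched in Rust with assumed names:

    // Index 0 is the <not a lexeme> sentinel, hence the skip(1).
    // An over-long candidate is skipped, not fatal.
    fn find_lexeme(rest: &[u8], lexemes: &[&[u8]]) -> Option<usize> {
        for (i, lex) in lexemes.iter().enumerate().skip(1) {
            if lex.len() > rest.len() {
                continue; // jg .next
            }
            if &rest[..lex.len()] == *lex {
                return Some(i); // token id i
            }
        }
        None // .not_found
    }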


@@ -12,8 +12,8 @@ impl Lexeme {
         unsafe {
             core::str::from_utf8_unchecked(
                 core::slice::from_raw_parts(
-                    (&raw const LEXEMES).read().add((self.0) as usize).read(),
-                    (&raw const LEXEME_LENS).read().add((self.0) as usize).read(),
+                    (&raw const LEXEMES).add((self.0) as usize).read(),
+                    (&raw const LEXEME_LENS).add((self.0) as usize).read(),
                 )
             )
         }
@@ -42,10 +42,10 @@ unsafe extern "C" {
     unsafe fn skip_whitespace() -> ();
     unsafe fn find_lexeme() -> u8;
-    static mut LEXEMES: *mut *const u8;
-    static mut LEXEME_LENS: *mut usize;
+    static mut LEXEMES: *const u8;
+    static mut LEXEME_LENS: usize;
     static mut NUM_LEXEMES: usize;
-    static mut TOKENS: *mut u32;
+    static mut TOKENS: u8;
     static mut input_file: u32;
     static mut buffer: *mut u8;
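
These two hunks are one fix. A static declared in an unsafe extern block names the bytes at the symbol itself, so the old *mut *const u8 type described a level of indirection that does not exist: the asm label LEXEMES is the first table entry, not a pointer to the table. Re-declaring each symbol as its element type and dropping the extra .read() makes &raw const LEXEMES the table base. The pattern, sketched with a hypothetical symbol:

    // Hypothetical foreign table, defined in asm as: NAMES: dq ptr0, ptr1, ...
    unsafe extern "C" {
        static mut NAMES: *const u8; // type of one element, not of the table
    }
    fn name_at(i: usize) -> *const u8 {
        // &raw const NAMES has type *const *const u8 and already points at
        // the table base, so a single add + read fetches entry i.
        unsafe { (&raw const NAMES).add(i).read() }
    }
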
@@ -55,8 +55,37 @@ unsafe extern "C" {
     unsafe fn exit(code: i32) -> !;
 }
+// fn lexemes_raw() -> &'static [*const u8] {
+//     unsafe {
+//         core::slice::from_raw_parts(
+//             (&raw const LEXEMES),
+//             (&raw const NUM_LEXEMES).read(),
+//         )
+//     }
+// }
+// fn lexeme_lens() -> &'static [usize] {
+//     unsafe {
+//         core::slice::from_raw_parts(
+//             (&raw const LEXEME_LENS),
+//             (&raw const NUM_LEXEMES).read(),
+//         )
+//     }
+// }
+// fn lexeme_iter() -> impl Iterator<Item = &'static str> {
+//     lexemes_raw().iter().zip(lexeme_lens().iter()).map(|(&ptr, &len)| {
+//         // SAFETY: lexemes_raw and lexeme_lens are guaranteed to contain valid
+//         // UTF-8 data and correct lengths.
+//         unsafe {
+//             core::str::from_utf8_unchecked(core::slice::from_raw_parts(ptr, len))
+//         }
+//     })
+// }
 fn main() {
+    let path = c"tests/tokens.l";
     unsafe {
         assert_eq!((&raw const input_file).read(), 0);
         assert_eq!((&raw const buffer_len).read(), 0);
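
The commented-out helpers wrap the raw tables as ordinary slices so later code can iterate them safely. If they are enabled as written, a debug dump of the whole lexeme table would look roughly like this (hypothetical usage, not part of the commit):

    for (i, lex) in lexeme_iter().enumerate() {
        eprintln!("lexeme {i}: {lex:?}");
    }
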
@@ -68,8 +97,9 @@ fn main() {
         eprintln!("{}: {:?}[{}..{}]", (&raw const input_file).read(), (&raw const buffer).read(), (&raw const cursor).read(), (&raw const buffer_len).read());
         tokeniser_print();
-        find_lexeme().as_lexeme().map(|lexeme| {
+        while let Some(lexeme) = find_lexeme().as_lexeme() {
             eprintln!("Found lexeme: {}", lexeme.lex());
-        });
+        }
         eprintln!("Finished tokenising.");
     }
 }
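
Option::map runs its closure at most once, so the old code printed a single lexeme and fell through. The while let keeps calling find_lexeme until as_lexeme yields None which, assuming find_lexeme also advances cursor past each match, walks the entire input. The general shape:

    // Draining a fallible producer: map visits at most one value;
    // while-let re-polls until the producer returns None.
    fn drain_all(mut next_lexeme: impl FnMut() -> Option<&'static str>) {
        while let Some(lexeme) = next_lexeme() {
            eprintln!("Found lexeme: {lexeme}");
        }
    }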