From 63fbca378bb1cdf60dddb2f9f470d4038225f16d Mon Sep 17 00:00:00 2001
From: janis
Date: Tue, 28 Oct 2025 12:44:08 +0100
Subject: [PATCH] can tokenise simple tokens

---
 lang/src/lib.asm       |  2 +-
 lang/src/tokeniser.asm | 25 +++++++++++++++----------
 lang/tests/tokens.rs   | 44 +++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/lang/src/lib.asm b/lang/src/lib.asm
index 1f60a21..fbee857 100644
--- a/lang/src/lib.asm
+++ b/lang/src/lib.asm
@@ -393,7 +393,7 @@ is_id_start:
 
 ;; dil: byte to check
 is_whitespace:
-    cmp dil, ' '
+    cmp dil, 32 ; space
     je .is_ws
     cmp dil, 9 ; tab
     je .is_ws
diff --git a/lang/src/tokeniser.asm b/lang/src/tokeniser.asm
index 63a21ad..0cc0099 100644
--- a/lang/src/tokeniser.asm
+++ b/lang/src/tokeniser.asm
@@ -112,7 +112,8 @@ global LEXEMES
 global TOKENS
 global LEXEME_LENS
 global NUM_LEXEMES
-
+
+align 8
 LEXEMES: dq \
     LEX_NOT_A_LEXEME, \
     LEX_LET, \
@@ -142,6 +143,7 @@ LEXEMES: dq \
     LEX_PIPE, \
     LEX_AMP, \
     LEX_EQEQ
+align 8
 TOKENS: db \
     TOKEN_EOF, \
     TOKEN_LET, \
@@ -171,6 +173,7 @@ TOKENS: db \
     TOKEN_PIPE, \
     TOKEN_AMP, \
     TOKEN_EQEQ
+align 8
 LEXEME_LENS: dq \
     0, \
     LEX_LET_len, \
@@ -201,7 +204,8 @@ LEXEME_LENS: dq \
     LEX_AMP_len, \
     LEX_EQEQ_len
 
-NUM_LEXEMES equ 28
+align 8
+NUM_LEXEMES: dq 28
 
 LEX_NOT_A_LEXEME db "", 0
 TOKEN_EOF equ 0
@@ -353,24 +357,25 @@ skip_whitespaces:
     mov rbp, rsp
     push r12
     push r13
+    push r14
     ; let start = buffer.add(cursor);
     ; let end = buffer.add(buffer_len);
-    mov rax, [rel cursor]
-    mov r12, [rel buffer]
+    mov r12, [rel cursor]
     mov r13, [rel buffer_len]
-    add r13, r12 ; end
-    add r12, rax ; start
+    mov r14, [rel buffer]
     ; for ptr in start..end {
 .loop:
     cmp r12, r13
     jge .done
-    mov dil, [r12]
+    mov dil, [r14 + r12]
     call is_whitespace
     test rax, rax
     je .done
     inc r12
     jmp .loop
 .done:
+    mov [rel cursor], r12
+    pop r14
     pop r13
     pop r12
     pop rbp
@@ -399,7 +404,7 @@ find_lexeme:
     mov r12, 1
     ; for 1..NUM_LEXEMES {
 .loop:
-    cmp r12, NUM_LEXEMES
+    cmp r12, [rel NUM_LEXEMES]
     jge .not_found
     ; let lexeme = LEXEMES[i];
     lea rdi, [rel LEXEMES]
@@ -415,8 +420,8 @@ find_lexeme:
     jo .not_found
     ; if lexeme.len() > buffer.len() - cursor {
     cmp rsi, rcx
-    jg .not_found
-    ; goto .not_found
+    jg .next
+    ; continue;
     ; }
     mov rcx, rsi
     ; if buffer[cursor..cursor+len] == lexeme {
diff --git a/lang/tests/tokens.rs b/lang/tests/tokens.rs
index 6464d59..489f6ff 100644
--- a/lang/tests/tokens.rs
+++ b/lang/tests/tokens.rs
@@ -12,8 +12,8 @@ impl Lexeme {
         unsafe {
             core::str::from_utf8_unchecked(
                 core::slice::from_raw_parts(
-                    (&raw const LEXEMES).read().add((self.0) as usize).read(),
-                    (&raw const LEXEME_LENS).read().add((self.0) as usize).read(),
+                    (&raw const LEXEMES).add((self.0) as usize).read(),
+                    (&raw const LEXEME_LENS).add((self.0) as usize).read(),
                 )
             )
         }
@@ -42,10 +42,10 @@ unsafe extern "C" {
     unsafe fn skip_whitespace() -> ();
     unsafe fn find_lexeme() -> u8;
 
-    static mut LEXEMES: *mut *const u8;
-    static mut LEXEME_LENS: *mut usize;
+    static mut LEXEMES: *const u8;
+    static mut LEXEME_LENS: usize;
     static mut NUM_LEXEMES: usize;
-    static mut TOKENS: *mut u32;
+    static mut TOKENS: u8;
 
     static mut input_file: u32;
     static mut buffer: *mut u8;
@@ -55,8 +55,37 @@ unsafe extern "C" {
     unsafe fn exit(code: i32) -> !;
 }
 
+// fn lexemes_raw() -> &'static [*const u8] {
+//     unsafe {
+//         core::slice::from_raw_parts(
+//             (&raw const LEXEMES),
+//             (&raw const NUM_LEXEMES).read(),
+//         )
+//     }
+// }
+
+// fn lexeme_lens() -> &'static [usize] {
+//     unsafe {
+//         core::slice::from_raw_parts(
+//             (&raw const LEXEME_LENS),
+//             (&raw const NUM_LEXEMES).read(),
+//         )
+//     }
+// }
+
+// fn lexeme_iter() -> impl Iterator<Item = &'static str> {
+//     lexemes_raw().iter().zip(lexeme_lens().iter()).map(|(&ptr, &len)| {
+//         // SAFETY: lexemes_raw and lexeme_lens are guaranteed to contain valid
+//         // UTF-8 data and correct lengths.
+//         unsafe {
+//             core::str::from_utf8_unchecked(core::slice::from_raw_parts(ptr, len))
+//         }
+//     })
+// }
+
 fn main() {
     let path = c"tests/tokens.l";
+
     unsafe {
         assert_eq!((&raw const input_file).read(), 0);
         assert_eq!((&raw const buffer_len).read(), 0);
@@ -68,8 +97,9 @@ fn main() {
         eprintln!("{}: {:?}[{}..{}]", (&raw const input_file).read(), (&raw const buffer).read(), (&raw const cursor).read(), (&raw const buffer_len).read());
         tokeniser_print();
 
-        find_lexeme().as_lexeme().map(|lexeme| {
+        while let Some(lexeme) = find_lexeme().as_lexeme() {
             eprintln!("Found lexeme: {}", lexeme.lex());
-        });
+        }
+        eprintln!("Finished tokenising.");
     }
 }
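
Note (not part of the patch, for review context only): a minimal Rust sketch of the behaviour the two reworked routines implement after this change. The function names, signatures, and slice-based types below are illustrative assumptions, not code from the repository; the real implementation is the assembly above, which works on the global buffer/cursor state and maps a matching table index through TOKENS to a token id.

// Sketch of skip_whitespaces after this patch: advance an index from cursor
// towards buffer_len and store it back, instead of walking raw pointers.
// The assembly's is_whitespace only accepts the bytes it explicitly checks
// (space, tab, ...), so is_ascii_whitespace here is only an approximation.
fn skip_whitespaces_sketch(buffer: &[u8], cursor: &mut usize) {
    while *cursor < buffer.len() && buffer[*cursor].is_ascii_whitespace() {
        *cursor += 1;
    }
}

// Sketch of the lexeme scan in find_lexeme: index 0 is LEX_NOT_A_LEXEME, so
// the loop starts at 1; lexemes that would overrun the remaining input are
// skipped (the jg .not_found -> jg .next fix in this patch); the first prefix
// match at the cursor wins. Assumes cursor <= buffer.len().
fn find_lexeme_sketch(buffer: &[u8], cursor: usize, lexemes: &[&[u8]]) -> Option<usize> {
    for (i, lexeme) in lexemes.iter().enumerate().skip(1) {
        if lexeme.len() > buffer.len() - cursor {
            continue;
        }
        if &buffer[cursor..cursor + lexeme.len()] == *lexeme {
            return Some(i);
        }
    }
    None
}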