can tokenise simple tokens

janis 2025-10-28 12:44:08 +01:00
parent 719451b935
commit 63fbca378b
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8
3 changed files with 53 additions and 18 deletions

View file

@@ -393,7 +393,7 @@ is_id_start:
 ;; dil: byte to check
 is_whitespace:
-    cmp dil, ' '
+    cmp dil, 32 ; space
     je .is_ws
     cmp dil, 9 ; tab
     je .is_ws
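In Rust terms, the byte test this hunk touches is roughly the following minimal sketch; it only covers the space and tab bytes visible in the hunk, and the real routine may accept further whitespace bytes below the shown context:

    // Sketch of the byte check above; 32 is the ASCII code for ' '.
    // The real is_whitespace may also accept bytes not shown in this hunk.
    fn is_whitespace(byte: u8) -> bool {
        matches!(byte, 32 /* space */ | 9 /* tab */)
    }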

View file

@@ -112,7 +112,8 @@ global LEXEMES
 global TOKENS
 global LEXEME_LENS
 global NUM_LEXEMES
+align 8
 LEXEMES: dq \
     LEX_NOT_A_LEXEME, \
     LEX_LET, \
@@ -142,6 +143,7 @@ LEXEMES: dq \
     LEX_PIPE, \
     LEX_AMP, \
     LEX_EQEQ
+align 8
 TOKENS: db \
     TOKEN_EOF, \
     TOKEN_LET, \
@@ -171,6 +173,7 @@ TOKENS: db \
     TOKEN_PIPE, \
     TOKEN_AMP, \
     TOKEN_EQEQ
+align 8
 LEXEME_LENS: dq \
     0, \
     LEX_LET_len, \
@@ -201,7 +204,8 @@ LEXEME_LENS: dq \
     LEX_AMP_len, \
     LEX_EQEQ_len
-NUM_LEXEMES equ 28
+align 8
+NUM_LEXEMES: dq 28

 LEX_NOT_A_LEXEME db "<not a lexeme>", 0

 TOKEN_EOF equ 0
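Turning NUM_LEXEMES from an equ constant into a dq data symbol matters for linking: an equ value exists only at assembly time and has no address, while dq 28 reserves an 8-byte slot that both the assembly (cmp r12, [rel NUM_LEXEMES] later in this file) and the Rust extern block can load. The added align 8 directives presumably keep the qword tables and this slot 8-byte aligned. A minimal sketch of the Rust-side read, assuming the declaration shown later in this commit:

    // Hedged sketch: reads the count exported by the NUM_LEXEMES: dq 28 slot above.
    unsafe extern "C" {
        static mut NUM_LEXEMES: usize;
    }

    fn num_lexemes() -> usize {
        unsafe { (&raw const NUM_LEXEMES).read() }
    }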
@@ -353,24 +357,25 @@ skip_whitespaces:
     mov rbp, rsp
     push r12
     push r13
+    push r14
     ; let start = buffer.add(cursor);
     ; let end = buffer.add(buffer_len);
-    mov rax, [rel cursor]
-    mov r12, [rel buffer]
+    mov r12, [rel cursor]
     mov r13, [rel buffer_len]
-    add r13, r12 ; end
-    add r12, rax ; start
+    mov r14, [rel buffer]
     ; for ptr in start..end {
 .loop:
     cmp r12, r13
     jge .done
-    mov dil, [r12]
+    mov dil, [r14 + r12]
     call is_whitespace
     test rax, rax
     je .done
     inc r12
     jmp .loop
 .done:
+    mov [rel cursor], r12
+    pop r14
     pop r13
     pop r12
     pop rbp
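After this change skip_whitespaces walks an index rather than a raw pointer: r12 holds the cursor, r13 the buffer length, r14 the buffer base, and the loop now stores the advanced cursor back to the cursor global, which the old version never did. Roughly the following Rust, written as a sketch with plain parameters standing in for the buffer/buffer_len/cursor globals the assembly uses:

    // Sketch of the index-based loop; the inline whitespace test stands in for
    // the is_whitespace routine and may not cover every byte it accepts.
    fn skip_whitespace(buffer: &[u8], mut cursor: usize) -> usize {
        while cursor < buffer.len() {              // cmp r12, r13 / jge .done
            let byte = buffer[cursor];             // mov dil, [r14 + r12]
            if !matches!(byte, b' ' | b'\t') {     // call is_whitespace
                break;                             // test rax, rax / je .done
            }
            cursor += 1;                           // inc r12
        }
        cursor // new in this commit: written back via mov [rel cursor], r12
    }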
@@ -399,7 +404,7 @@ find_lexeme:
     mov r12, 1
     ; for 1..NUM_LEXEMES {
 .loop:
-    cmp r12, NUM_LEXEMES
+    cmp r12, [rel NUM_LEXEMES]
     jge .not_found
     ; let lexeme = LEXEMES[i];
     lea rdi, [rel LEXEMES]
@@ -415,8 +420,8 @@ find_lexeme:
     jo .not_found
     ; if lexeme.len() > buffer.len() - cursor {
     cmp rsi, rcx
-    jg .not_found
-    ; goto .not_found
+    jg .next
+    ; continue;
     ; }
     mov rcx, rsi
     ; if buffer[cursor..cursor+len] == lexeme {
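Two behavioural changes here: the loop bound is now loaded from the NUM_LEXEMES data symbol instead of being baked in as an immediate, and a candidate lexeme that is longer than the remaining input now falls through to the next candidate (jg .next / continue) instead of aborting the whole search. As a sketch, with slices standing in for the assembly tables and the unconsumed input, and Some(index) standing in for the u8 index find_lexeme returns:

    // Hedged sketch of the matching loop; index 0 is the <not a lexeme> slot,
    // so the search starts at 1, mirroring mov r12, 1.
    fn find_lexeme(lexemes: &[&[u8]], rest: &[u8]) -> Option<usize> {
        for (i, &lexeme) in lexemes.iter().enumerate().skip(1) {
            if lexeme.len() > rest.len() {
                continue; // was: bail out to .not_found before this commit
            }
            if rest.starts_with(lexeme) {
                return Some(i);
            }
        }
        None
    }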

View file

@@ -12,8 +12,8 @@ impl Lexeme {
         unsafe {
             core::str::from_utf8_unchecked(
                 core::slice::from_raw_parts(
-                    (&raw const LEXEMES).read().add((self.0) as usize).read(),
-                    (&raw const LEXEME_LENS).read().add((self.0) as usize).read(),
+                    (&raw const LEXEMES).add((self.0) as usize).read(),
+                    (&raw const LEXEME_LENS).add((self.0) as usize).read(),
                 )
             )
         }
@@ -42,10 +42,10 @@ unsafe extern "C" {
     unsafe fn skip_whitespace() -> ();
     unsafe fn find_lexeme() -> u8;
-    static mut LEXEMES: *mut *const u8;
-    static mut LEXEME_LENS: *mut usize;
+    static mut LEXEMES: *const u8;
+    static mut LEXEME_LENS: usize;
     static mut NUM_LEXEMES: usize;
-    static mut TOKENS: *mut u32;
+    static mut TOKENS: u8;
     static mut input_file: u32;
     static mut buffer: *mut u8;
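The statics are now declared with the element type of each assembly table (a pointer for the LEXEMES qwords, a usize for LEXEME_LENS, a byte for TOKENS) rather than as pointers to the tables, so &raw const already points at the first entry; that is why the extra .read() disappears in the hunk above. A minimal sketch of reading entry i under that layout, assuming the extern declarations above are in scope:

    // Hedged sketch: lexeme_entry is a hypothetical helper, not part of the commit.
    unsafe fn lexeme_entry(i: usize) -> (*const u8, usize) {
        let ptr = (&raw const LEXEMES).add(i).read();     // i-th pointer in the dq table
        let len = (&raw const LEXEME_LENS).add(i).read(); // i-th length in the dq table
        (ptr, len)
    }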
@@ -55,8 +55,37 @@ unsafe extern "C" {
     unsafe fn exit(code: i32) -> !;
 }

+// fn lexemes_raw() -> &'static [*const u8] {
+//     unsafe {
+//         core::slice::from_raw_parts(
+//             (&raw const LEXEMES),
+//             (&raw const NUM_LEXEMES).read(),
+//         )
+//     }
+// }
+
+// fn lexeme_lens() -> &'static [usize] {
+//     unsafe {
+//         core::slice::from_raw_parts(
+//             (&raw const LEXEME_LENS),
+//             (&raw const NUM_LEXEMES).read(),
+//         )
+//     }
+// }
+
+// fn lexeme_iter() -> impl Iterator<Item = &'static str> {
+//     lexemes_raw().iter().zip(lexeme_lens().iter()).map(|(&ptr, &len)| {
+//         // SAFETY: lexemes_raw and lexeme_lens are guaranteed to contain valid
+//         // UTF-8 data and correct lengths.
+//         unsafe {
+//             core::str::from_utf8_unchecked(core::slice::from_raw_parts(ptr, len))
+//         }
+//     })
+// }
+
 fn main() {
     let path = c"tests/tokens.l";
     unsafe {
         assert_eq!((&raw const input_file).read(), 0);
         assert_eq!((&raw const buffer_len).read(), 0);
@@ -68,8 +97,9 @@ fn main() {
         eprintln!("{}: {:?}[{}..{}]", (&raw const input_file).read(), (&raw const buffer).read(), (&raw const cursor).read(), (&raw const buffer_len).read());
         tokeniser_print();
-        find_lexeme().as_lexeme().map(|lexeme| {
+        while let Some(lexeme) = find_lexeme().as_lexeme() {
             eprintln!("Found lexeme: {}", lexeme.lex());
-        });
+        }
+        eprintln!("Finished tokenising.");
     }
 }
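The new while-let loop keeps calling find_lexeme() until as_lexeme() returns None. The as_lexeme implementation itself is not part of this diff; a purely hypothetical sketch, assuming Lexeme is a newtype over the u8 table index and that index 0 (the <not a lexeme> / TOKEN_EOF slot in the tables above) is the no-match return value:

    // Hypothetical sketch only; the real as_lexeme() is not shown in this commit.
    struct Lexeme(u8);

    trait AsLexeme {
        fn as_lexeme(self) -> Option<Lexeme>;
    }

    impl AsLexeme for u8 {
        fn as_lexeme(self) -> Option<Lexeme> {
            // Index 0 maps to LEX_NOT_A_LEXEME / TOKEN_EOF, so treat it as None.
            (self != 0).then(|| Lexeme(self))
        }
    }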