return lexeme str slice from find_lexeme function

This commit is contained in:
janis 2025-10-28 16:30:15 +01:00
parent b495114937
commit 1886231dcd
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8
2 changed files with 90 additions and 46 deletions

View file

@ -313,7 +313,9 @@ NUM_LEXEMES: dq 30
section .text section .text
;; rdi: length of matched lexeme ;; rdi: length of previously matched lexeme
;; returns the length of the ident
;; fn is_ident(lexeme_len: usize) -> usize
is_ident: is_ident:
push rbp push rbp
mov rbp, rsp mov rbp, rsp
@ -351,7 +353,7 @@ is_ident:
mov rax, [rel cursor] mov rax, [rel cursor]
add rax, r14 add rax, r14
mov [rel cursor], rax mov [rel cursor], rax
mov rax, 1 mov rax, r14
jmp .epilogue jmp .epilogue
.not_ident: .not_ident:
xor rax, rax xor rax, rax
@ -396,11 +398,22 @@ skip_whitespaces:
pop rbp pop rbp
ret ret
;; rdi: pointer to out-struct
;; fn find_lexeme() -> (u8, *const u8, usize)
find_lexeme: find_lexeme:
push rbp push rbp
mov rbp, rsp mov rbp, rsp
push rdi
; skip whitespaces ; skip whitespaces
call skip_whitespaces call skip_whitespaces
;; init out struct
mov rdi, [rsp]
mov rax, [rel buffer]
add rax, [rel cursor]
mov qword [rdi], 0
mov [rdi + 8], rax
mov qword [rdi + 16], 0
; check length ; check length
mov rax, [rel cursor] mov rax, [rel cursor]
mov rcx, [rel buffer_len] mov rcx, [rel buffer_len]
@ -411,6 +424,7 @@ find_lexeme:
.eof: .eof:
; return TOKEN_EOF; ; return TOKEN_EOF;
mov rax, TOKEN_EOF mov rax, TOKEN_EOF
pop rdi
pop rbp pop rbp
ret ret
; } ; }
@ -444,6 +458,7 @@ find_lexeme:
test rax, rax test rax, rax
je .next je .next
; if is_ident() { ; if is_ident() {
mov rdi, rsi
call is_ident call is_ident
test rax, rax test rax, rax
; return TOKEN_IDENT; ; return TOKEN_IDENT;
@ -454,16 +469,20 @@ find_lexeme:
; return TOKEN_NUMBER; ; return TOKEN_NUMBER;
jne .is_number jne .is_number
; } else { ; } else {
mov rdi, [rsp + 8]
mov rax, [rel cursor] mov rax, [rel cursor]
; cursor += len; ; cursor += len;
lea rdi, [rel LEXEME_LENS] lea rsi, [rel LEXEME_LENS]
mov rdi, [rdi + r12*8] mov rsi, [rsi + r12*8]
add rax, rdi add rax, rsi
mov [rel cursor], rax mov [rel cursor], rax
; return TOKENS[i]; ; return TOKENS[i];
lea rax, [rel TOKENS] lea rax, [rel TOKENS]
mov al, [rax + r12] mov al, [rax + r12]
and rax, 0xFF and rax, 0xFF
mov rdi, [rsp + 8]
mov [rdi], al
mov [rdi + 16], rsi
jmp .epilogue jmp .epilogue
; } ; }
.next: .next:
@ -473,6 +492,7 @@ find_lexeme:
; } ; }
.not_found: .not_found:
; if is_ident() { ; if is_ident() {
xor rdi, rdi
call is_ident call is_ident
test rax, rax test rax, rax
; return TOKEN_IDENT; ; return TOKEN_IDENT;
@ -484,15 +504,24 @@ find_lexeme:
jne .is_number jne .is_number
; } else { ; } else {
; return TOKEN_EOF; ; return TOKEN_EOF;
mov rax, TOKEN_EOF mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_EOF
; } ; }
.epilogue: .epilogue:
pop r12 pop r12
pop rdi
pop rbp pop rbp
ret ret
.is_ident: .is_ident:
mov rax, TOKEN_IDENT ; rax = len
; out.0 = TOKEN_IDENT
; out.1 = buffer.add(cursor - len)
; out.2 = len
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_IDENT
mov [rdi + 16], rax
jmp .epilogue jmp .epilogue
.is_number: .is_number:
mov rax, TOKEN_NUMBER mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_NUMBER
jmp .epilogue jmp .epilogue

View file

@ -3,21 +3,25 @@ extern "C" fn panic() -> ! {
panic!("Called panic from external code."); panic!("Called panic from external code.");
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug)]
struct Lexeme(u8); struct Lexeme(u8, &'static str);
// Token-aware equality for `Lexeme`.
//
// Lexemes with different token ids are never equal. For token ids 30 and 31
// (identifiers and numbers) the lexeme text must match as well; for every
// other token id the text is ignored.
impl PartialEq for Lexeme {
    fn eq(&self, other: &Self) -> bool {
        // Guard clause: a token id mismatch settles the comparison.
        if self.0 != other.0 {
            return false;
        }

        // Token ids agree from here on; only identifiers and numbers carry
        // text that takes part in the comparison.
        !matches!(self.0, 30 | 31) || self.1 == other.1
    }
}

// Marker impl: the comparison above is a full equivalence relation.
impl Eq for Lexeme {}
impl Lexeme { impl Lexeme {
fn lex(&self) -> &'static str { fn lex(&self) -> &'static str {
// SAFETY: lens contains the correct length for each lexeme, and lexemes self.1
// contains pointers to valid 'static UTF-8 data.
unsafe {
core::str::from_utf8_unchecked(
core::slice::from_raw_parts(
(&raw const LEXEMES).add((self.0) as usize).read(),
(&raw const LEXEME_LENS).add((self.0) as usize).read(),
)
)
}
} }
} }
@ -25,10 +29,20 @@ trait AsLexeme {
fn as_lexeme(self) -> Option<Lexeme>; fn as_lexeme(self) -> Option<Lexeme>;
} }
impl AsLexeme for u8 { #[repr(C)]
/// Raw result record filled in by the external `find_lexeme` routine.
///
/// The assembly side writes the token at offset 0, the lexeme pointer at
/// offset 8 and the length at offset 16 of its out-struct, so the field
/// order and types here must stay in sync with it.
struct LexemeRaw {
/// Token id. `as_lexeme` turns 0 into `None` and any other value into a
/// `Lexeme`.
token: u8,
/// Start of the lexeme text: `find_lexeme` stores `buffer + cursor` here,
/// sampled right after whitespace has been skipped.
lexeme: *const u8,
/// Number of bytes at `lexeme` that belong to the lexeme.
/// NOTE(review): the `.is_number` path of `find_lexeme` stores only the
/// token and appears to leave this at its initial 0 — confirm.
len: usize,
}
impl AsLexeme for LexemeRaw {
fn as_lexeme(self) -> Option<Lexeme> { fn as_lexeme(self) -> Option<Lexeme> {
match self { let Self {token, lexeme, len} = self;
1.. => Some(Lexeme(self)), let slice = unsafe {core::str::from_utf8_unchecked(
core::slice::from_raw_parts(
lexeme, len))};
match token {
1.. => Some(Lexeme(token, slice)),
_ => None, _ => None,
} }
} }
@ -41,7 +55,8 @@ unsafe extern "C" {
unsafe fn is_ident(len: usize) -> bool; unsafe fn is_ident(len: usize) -> bool;
unsafe fn is_number(len: usize) -> bool; unsafe fn is_number(len: usize) -> bool;
unsafe fn skip_whitespace() -> (); unsafe fn skip_whitespace() -> ();
unsafe fn find_lexeme() -> u8;
unsafe fn find_lexeme() -> LexemeRaw;
static mut LEXEMES: *const u8; static mut LEXEMES: *const u8;
static mut LEXEME_LENS: usize; static mut LEXEME_LENS: usize;
@ -80,17 +95,17 @@ fn main() {
eprintln!("ok."); eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[ assert_eq!(&collect_tokens()[..], &[
Lexeme(4), Lexeme(4, ""),
Lexeme(1), Lexeme(1, ""),
Lexeme(2), Lexeme(2, ""),
Lexeme(3), Lexeme(3, ""),
Lexeme(4), Lexeme(4, ""),
Lexeme(8), Lexeme(8, ""),
Lexeme(13), Lexeme(13, ""),
Lexeme(11), Lexeme(11, ""),
Lexeme(10), Lexeme(10, ""),
Lexeme(9), Lexeme(9, ""),
Lexeme(5), Lexeme(5, ""),
][..]); ][..]);
eprint!("Initializing tokeniser.. "); eprint!("Initializing tokeniser.. ");
@ -98,17 +113,17 @@ fn main() {
eprintln!("ok."); eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[ assert_eq!(&collect_tokens()[..], &[
Lexeme(19), Lexeme(19, ""),
Lexeme(18), Lexeme(18, ""),
Lexeme(28), Lexeme(28, ""),
Lexeme(29), Lexeme(29, ""),
Lexeme(21), Lexeme(21, ""),
Lexeme(20), Lexeme(20, ""),
Lexeme(24), Lexeme(24, ""),
Lexeme(12), Lexeme(12, ""),
Lexeme(23), Lexeme(23, ""),
Lexeme(22), Lexeme(22, ""),
Lexeme(15), Lexeme(15, ""),
][..]); ][..]);
eprintln!("Finished tokenising."); eprintln!("Finished tokenising.");