diff --git a/lang/src/tokeniser.asm b/lang/src/tokeniser.asm
index 6456008..4341603 100644
--- a/lang/src/tokeniser.asm
+++ b/lang/src/tokeniser.asm
@@ -313,7 +313,9 @@ NUM_LEXEMES: dq 30
 
 section .text
 
-;; rdi: length of matched lexeme
+;; rdi: length of previously matched lexeme
+;; returns the length of the ident
+;; fn is_ident(lexeme_len: usize) -> usize
 is_ident:
     push rbp
     mov rbp, rsp
@@ -351,7 +353,7 @@ is_ident:
     mov rax, [rel cursor]
     add rax, r14
     mov [rel cursor], rax
-    mov rax, 1
+    mov rax, r14
     jmp .epilogue
 .not_ident:
     xor rax, rax
@@ -396,11 +398,22 @@ skip_whitespaces:
     pop rbp
     ret
 
+
+;; rdi: pointer to out-struct
+;; fn find_lexeme() -> (u8, *const u8, usize)
 find_lexeme:
     push rbp
     mov rbp, rsp
+    push rdi
     ; skip whitespaces
     call skip_whitespaces
+    ;; init out struct
+    mov rdi, [rsp]
+    mov rax, [rel buffer]
+    add rax, [rel cursor]
+    mov qword [rdi], 0
+    mov [rdi + 8], rax
+    mov qword [rdi + 16], 0
     ; check length
     mov rax, [rel cursor]
     mov rcx, [rel buffer_len]
@@ -411,6 +424,7 @@ find_lexeme:
 .eof:
     ; return TOKEN_EOF;
     mov rax, TOKEN_EOF
+    pop rdi
     pop rbp
     ret
     ; }
@@ -444,6 +458,7 @@ find_lexeme:
     test rax, rax
     je .next
     ; if is_ident() {
+    mov rdi, rsi
     call is_ident
     test rax, rax
     ; return TOKEN_IDENT;
@@ -454,16 +469,20 @@ find_lexeme:
     ; return TOKEN_NUMBER;
     jne .is_number
     ; } else {
+    mov rdi, [rsp + 8]
     mov rax, [rel cursor]
     ; cursor += len;
-    lea rdi, [rel LEXEME_LENS]
-    mov rdi, [rdi + r12*8]
-    add rax, rdi
+    lea rsi, [rel LEXEME_LENS]
+    mov rsi, [rsi + r12*8]
+    add rax, rsi
     mov [rel cursor], rax
     ; return TOKENS[i];
     lea rax, [rel TOKENS]
     mov al, [rax + r12]
     and rax, 0xFF
+    mov rdi, [rsp + 8]
+    mov [rdi], al
+    mov [rdi + 16], rsi
     jmp .epilogue
     ; }
 .next:
@@ -473,6 +492,7 @@ find_lexeme:
     ; }
 .not_found:
     ; if is_ident() {
+    xor rdi, rdi
     call is_ident
     test rax, rax
     ; return TOKEN_IDENT;
@@ -484,15 +504,24 @@ find_lexeme:
     jne .is_number
     ; } else {
     ; return TOKEN_EOF;
-    mov rax, TOKEN_EOF
+    mov rdi, [rsp + 8]
+    mov qword [rdi], TOKEN_EOF
     ; }
 .epilogue:
     pop r12
+    pop rdi
     pop rbp
     ret
 .is_ident:
-    mov rax, TOKEN_IDENT
+    ; rax = len
+    ; out.0 = TOKEN_IDENT
+    ; out.1 = buffer.add(cursor - len)
+    ; out.2 = len
+    mov rdi, [rsp + 8]
+    mov qword [rdi], TOKEN_IDENT
+    mov [rdi + 16], rax
     jmp .epilogue
 .is_number:
-    mov rax, TOKEN_NUMBER
+    mov rdi, [rsp + 8]
+    mov qword [rdi], TOKEN_NUMBER
     jmp .epilogue
diff --git a/lang/tests/tokens.rs b/lang/tests/tokens.rs
index 55c4d61..57546d0 100644
--- a/lang/tests/tokens.rs
+++ b/lang/tests/tokens.rs
@@ -3,21 +3,25 @@ extern "C" fn panic() -> ! {
     panic!("Called panic from external code.");
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-struct Lexeme(u8);
+#[derive(Debug)]
+struct Lexeme(u8, &'static str);
+
+impl PartialEq for Lexeme {
+    fn eq(&self, other: &Self) -> bool {
+        match self.0 {
+            // Identifiers and numbers compare both token and lexeme
+            30 | 31 => self.0 == other.0 && self.1 == other.1,
+            _ => self.0 == other.0,
+
+        }
+    }
+}
+
+impl Eq for Lexeme {}
 
 impl Lexeme {
     fn lex(&self) -> &'static str {
-        // SAFETY: lens contains the correct length for each lexeme, and lexemes
-        // contains pointers to valid 'static UTF-8 data.
-        unsafe {
-            core::str::from_utf8_unchecked(
-                core::slice::from_raw_parts(
-                    (&raw const LEXEMES).add((self.0) as usize).read(),
-                    (&raw const LEXEME_LENS).add((self.0) as usize).read(),
-                )
-            )
-        }
+        self.1
     }
 }
 
@@ -25,10 +29,20 @@ trait AsLexeme {
     fn as_lexeme(self) -> Option<Lexeme>;
 }
 
-impl AsLexeme for u8 {
+#[repr(C)]
+struct LexemeRaw {
+    token: u8,
+    lexeme: *const u8,
+    len: usize,
+}
+
+impl AsLexeme for LexemeRaw {
     fn as_lexeme(self) -> Option<Lexeme> {
-        match self {
-            1.. => Some(Lexeme(self)),
+        let Self { token, lexeme, len } = self;
+        let slice = unsafe {
+            core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len))
+        };
+        match token {
+            1.. => Some(Lexeme(token, slice)),
             _ => None,
         }
     }
@@ -41,7 +55,8 @@ unsafe extern "C" {
     unsafe fn is_ident(len: usize) -> bool;
     unsafe fn is_number(len: usize) -> bool;
     unsafe fn skip_whitespace() -> ();
-    unsafe fn find_lexeme() -> u8;
+
+    unsafe fn find_lexeme() -> LexemeRaw;
 
     static mut LEXEMES: *const u8;
     static mut LEXEME_LENS: usize;
@@ -80,17 +95,17 @@ fn main() {
     eprintln!("ok.");
 
     assert_eq!(&collect_tokens()[..], &[
-        Lexeme(4),
-        Lexeme(1),
-        Lexeme(2),
-        Lexeme(3),
-        Lexeme(4),
-        Lexeme(8),
-        Lexeme(13),
-        Lexeme(11),
-        Lexeme(10),
-        Lexeme(9),
-        Lexeme(5),
+        Lexeme(4, ""),
+        Lexeme(1, ""),
+        Lexeme(2, ""),
+        Lexeme(3, ""),
+        Lexeme(4, ""),
+        Lexeme(8, ""),
+        Lexeme(13, ""),
+        Lexeme(11, ""),
+        Lexeme(10, ""),
+        Lexeme(9, ""),
+        Lexeme(5, ""),
     ][..]);
 
     eprint!("Initializing tokeniser.. ");
@@ -98,17 +113,17 @@ fn main() {
     eprintln!("ok.");
 
     assert_eq!(&collect_tokens()[..], &[
-        Lexeme(19),
-        Lexeme(18),
-        Lexeme(28),
-        Lexeme(29),
-        Lexeme(21),
-        Lexeme(20),
-        Lexeme(24),
-        Lexeme(12),
-        Lexeme(23),
-        Lexeme(22),
-        Lexeme(15),
+        Lexeme(19, ""),
+        Lexeme(18, ""),
+        Lexeme(28, ""),
+        Lexeme(29, ""),
+        Lexeme(21, ""),
+        Lexeme(20, ""),
+        Lexeme(24, ""),
+        Lexeme(12, ""),
+        Lexeme(23, ""),
+        Lexeme(22, ""),
+        Lexeme(15, ""),
    ][..]);
 
     eprintln!("Finished tokenising.");
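A note on the ABI this patch leans on: `LexemeRaw` is 24 bytes, larger than the 16-byte limit for register returns under the System V AMD64 ABI, so rustc returns it through a hidden pointer that the caller passes in `rdi`. That hidden pointer is the "out-struct" the assembly saves with `push rdi` and fills in before returning. The tests call `find_lexeme` through a `collect_tokens` helper whose body is outside this diff; below is a minimal sketch of what that caller side could look like, with the loop shape and the `Vec` return type assumed rather than taken from the source:

```rust
fn collect_tokens() -> Vec<Lexeme> {
    let mut tokens = Vec::new();
    loop {
        // SAFETY: the tokeniser has been initialised; find_lexeme writes
        // (token, lexeme pointer, len) through the hidden return slot.
        let raw = unsafe { find_lexeme() };
        // A zero token marks EOF, which as_lexeme() maps to None.
        match raw.as_lexeme() {
            Some(lexeme) => tokens.push(lexeme),
            None => break,
        }
    }
    tokens
}
```

This is also why `find_lexeme` initialises the struct to `(0, buffer + cursor, 0)` right after skipping whitespace: every exit path then only overwrites the fields it knows, and the EOF paths can return with the zeroed token field untouched.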