can tokenise simple tokens

janis 2025-10-28 12:44:08 +01:00
parent 719451b935
commit 63fbca378b
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8
3 changed files with 53 additions and 18 deletions


@@ -393,7 +393,7 @@ is_id_start:
 ;; dil: byte to check
 is_whitespace:
-    cmp dil, ' '
+    cmp dil, 32 ; space
     je .is_ws
     cmp dil, 9 ; tab
     je .is_ws
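
The change is cosmetic: the character literal ' ' becomes the explicit byte value 32, annotated with a comment. As a Rust sketch, the predicate this routine implements looks roughly like the following; only the space and tab arms are visible in this hunk, so any further cases (newline, carriage return) are an assumption:

    // Sketch of the is_whitespace predicate. Space and tab are the cases
    // shown in the diff; anything beyond that is assumed.
    fn is_whitespace(byte: u8) -> bool {
        matches!(byte, 32 /* ' ' */ | 9 /* '\t' */)
    }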


@@ -112,7 +112,8 @@ global LEXEMES
 global TOKENS
 global LEXEME_LENS
 global NUM_LEXEMES
+align 8
 LEXEMES: dq \
     LEX_NOT_A_LEXEME, \
     LEX_LET, \
@@ -142,6 +143,7 @@ LEXEMES: dq \
     LEX_PIPE, \
     LEX_AMP, \
     LEX_EQEQ
+align 8
 TOKENS: db \
     TOKEN_EOF, \
     TOKEN_LET, \
@@ -171,6 +173,7 @@ TOKENS: db \
     TOKEN_PIPE, \
     TOKEN_AMP, \
     TOKEN_EQEQ
+align 8
 LEXEME_LENS: dq \
     0, \
     LEX_LET_len, \
@@ -201,7 +204,8 @@ LEXEME_LENS: dq \
     LEX_AMP_len, \
     LEX_EQEQ_len
-NUM_LEXEMES equ 28
+align 8
+NUM_LEXEMES: dq 28
 LEX_NOT_A_LEXEME db "<not a lexeme>", 0
 TOKEN_EOF equ 0
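
LEXEMES, TOKENS, and LEXEME_LENS are parallel tables: the same index selects a lexeme's text pointer, its token id, and its byte length. The new align 8 directives keep each table on an 8-byte boundary, and NUM_LEXEMES changes from an assemble-time constant (equ) to an 8-byte value in memory (dq 28), which turns it into a symbol the Rust side can actually read. A rough Rust picture of the layout, with a hypothetical struct mirroring the asm symbols:

    // Parallel-table layout, sketched. The three slices correspond to the
    // asm tables and share one index space.
    struct LexemeTables {
        lexemes: &'static [*const u8], // LEXEMES: dq ...     (text pointers)
        lexeme_lens: &'static [usize], // LEXEME_LENS: dq ... (byte lengths)
        tokens: &'static [u8],         // TOKENS: db ...      (token ids)
    }

    impl LexemeTables {
        // Everything known about lexeme id i comes from the same index.
        fn entry(&self, i: usize) -> (*const u8, usize, u8) {
            (self.lexemes[i], self.lexeme_lens[i], self.tokens[i])
        }
    }
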
@@ -353,24 +357,25 @@ skip_whitespaces:
     mov rbp, rsp
     push r12
     push r13
+    push r14
     ; let start = buffer.add(cursor);
     ; let end = buffer.add(buffer_len);
-    mov rax, [rel cursor]
-    mov r12, [rel buffer]
+    mov r12, [rel cursor]
     mov r13, [rel buffer_len]
-    add r13, r12 ; end
-    add r12, rax ; start
+    mov r14, [rel buffer]
     ; for ptr in start..end {
 .loop:
     cmp r12, r13
     jge .done
-    mov dil, [r12]
+    mov dil, [r14 + r12]
     call is_whitespace
     test rax, rax
     je .done
     inc r12
     jmp .loop
 .done:
     mov [rel cursor], r12
+    pop r14
     pop r13
     pop r12
     pop rbp
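
The rewrite fixes how the cursor leaves the loop. The old code built start and end pointers in r12 and r13 and then stored r12 back into cursor at .done, so cursor ended up holding buffer + offset instead of an offset. The new code keeps r12 as a plain index: r13 holds buffer_len as the limit, r14 holds the buffer base for the load, and the final mov [rel cursor], r12 writes a valid index. Roughly, in Rust, reusing the is_whitespace sketch above and assuming cursor and buffer_len are byte indices:

    // Index-based whitespace skip, mirroring the new code path.
    unsafe fn skip_whitespaces(buffer: *const u8, buffer_len: usize, cursor: &mut usize) {
        while *cursor < buffer_len && is_whitespace(buffer.add(*cursor).read()) {
            *cursor += 1; // inc r12
        }
    }
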
@@ -399,7 +404,7 @@ find_lexeme:
     mov r12, 1
     ; for 1..NUM_LEXEMES {
 .loop:
-    cmp r12, NUM_LEXEMES
+    cmp r12, [rel NUM_LEXEMES]
     jge .not_found
     ; let lexeme = LEXEMES[i];
     lea rdi, [rel LEXEMES]
@@ -415,8 +420,8 @@ find_lexeme:
     jo .not_found
     ; if lexeme.len() > buffer.len() - cursor {
     cmp rsi, rcx
-    jg .not_found
-    ; goto .not_found
+    jg .next
+    ; continue;
     ; }
     mov rcx, rsi
     ; if buffer[cursor..cursor+len] == lexeme {
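
Two fixes land in find_lexeme. First, now that NUM_LEXEMES is data rather than an equ constant, the loop bound has to be loaded from memory ([rel NUM_LEXEMES]); a bare NUM_LEXEMES operand would now compare against the label's address. Second, a candidate lexeme that is longer than the remaining input no longer aborts the whole search (jg .not_found) but skips to the next candidate (jg .next). The shape of the loop, sketched in Rust with assumed names:

    // Index 0 is the <not a lexeme> sentinel, hence the skip(1).
    // An over-long candidate is skipped, not fatal.
    fn find_lexeme(rest: &[u8], lexemes: &[&[u8]]) -> Option<usize> {
        for (i, lex) in lexemes.iter().enumerate().skip(1) {
            if lex.len() > rest.len() {
                continue; // jg .next
            }
            if &rest[..lex.len()] == *lex {
                return Some(i); // token id i
            }
        }
        None // .not_found
    }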


@@ -12,8 +12,8 @@ impl Lexeme {
         unsafe {
             core::str::from_utf8_unchecked(
                 core::slice::from_raw_parts(
-                    (&raw const LEXEMES).read().add((self.0) as usize).read(),
-                    (&raw const LEXEME_LENS).read().add((self.0) as usize).read(),
+                    (&raw const LEXEMES).add((self.0) as usize).read(),
+                    (&raw const LEXEME_LENS).add((self.0) as usize).read(),
                 )
             )
         }
@@ -42,10 +42,10 @@ unsafe extern "C" {
     unsafe fn skip_whitespace() -> ();
     unsafe fn find_lexeme() -> u8;
-    static mut LEXEMES: *mut *const u8;
-    static mut LEXEME_LENS: *mut usize;
+    static mut LEXEMES: *const u8;
+    static mut LEXEME_LENS: usize;
     static mut NUM_LEXEMES: usize;
-    static mut TOKENS: *mut u32;
+    static mut TOKENS: u8;
     static mut input_file: u32;
     static mut buffer: *mut u8;
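
These two hunks are one fix. A static declared in an unsafe extern block names the bytes at the symbol itself, so the old *mut *const u8 type described a level of indirection that does not exist: the asm label LEXEMES is the first table entry, not a pointer to the table. Re-declaring each symbol as its element type and dropping the extra .read() makes &raw const LEXEMES the table base. The pattern, sketched with a hypothetical symbol:

    // Hypothetical foreign table, defined in asm as: NAMES: dq ptr0, ptr1, ...
    unsafe extern "C" {
        static mut NAMES: *const u8; // type of one element, not of the table
    }
    fn name_at(i: usize) -> *const u8 {
        // &raw const NAMES has type *const *const u8 and already points at
        // the table base, so a single add + read fetches entry i.
        unsafe { (&raw const NAMES).add(i).read() }
    }
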
@@ -55,8 +55,37 @@ unsafe extern "C" {
     unsafe fn exit(code: i32) -> !;
 }
+// fn lexemes_raw() -> &'static [*const u8] {
+//     unsafe {
+//         core::slice::from_raw_parts(
+//             (&raw const LEXEMES),
+//             (&raw const NUM_LEXEMES).read(),
+//         )
+//     }
+// }
+// fn lexeme_lens() -> &'static [usize] {
+//     unsafe {
+//         core::slice::from_raw_parts(
+//             (&raw const LEXEME_LENS),
+//             (&raw const NUM_LEXEMES).read(),
+//         )
+//     }
+// }
+// fn lexeme_iter() -> impl Iterator<Item = &'static str> {
+//     lexemes_raw().iter().zip(lexeme_lens().iter()).map(|(&ptr, &len)| {
+//         // SAFETY: lexemes_raw and lexeme_lens are guaranteed to contain valid
+//         // UTF-8 data and correct lengths.
+//         unsafe {
+//             core::str::from_utf8_unchecked(core::slice::from_raw_parts(ptr, len))
+//         }
+//     })
+// }
 fn main() {
+    let path = c"tests/tokens.l";
     unsafe {
         assert_eq!((&raw const input_file).read(), 0);
         assert_eq!((&raw const buffer_len).read(), 0);
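
The commented-out helpers wrap the raw tables as ordinary slices so later code can iterate them safely. If they are enabled as written, a debug dump of the whole lexeme table would look roughly like this (hypothetical usage, not part of the commit):

    for (i, lex) in lexeme_iter().enumerate() {
        eprintln!("lexeme {i}: {lex:?}");
    }
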
@@ -68,8 +97,9 @@ fn main() {
         eprintln!("{}: {:?}[{}..{}]", (&raw const input_file).read(), (&raw const buffer).read(), (&raw const cursor).read(), (&raw const buffer_len).read());
         tokeniser_print();
-        find_lexeme().as_lexeme().map(|lexeme| {
+        while let Some(lexeme) = find_lexeme().as_lexeme() {
             eprintln!("Found lexeme: {}", lexeme.lex());
-        });
+        }
         eprintln!("Finished tokenising.");
     }
 }
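
Option::map runs its closure at most once, so the old code printed a single lexeme and fell through. The while let keeps calling find_lexeme until as_lexeme yields None which, assuming find_lexeme also advances cursor past each match, walks the entire input. The general shape:

    // Draining a fallible producer: map visits at most one value;
    // while-let re-polls until the producer returns None.
    fn drain_all(mut next_lexeme: impl FnMut() -> Option<&'static str>) {
        while let Some(lexeme) = next_lexeme() {
            eprintln!("Found lexeme: {lexeme}");
        }
    }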