rework tokeniser

This commit is contained in:
janis 2025-10-21 00:36:27 +02:00
parent d47c933c04
commit f1faac639c
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8
2 changed files with 234 additions and 42 deletions

View file

@ -389,3 +389,20 @@ is_id_start:
.is_ret: .is_ret:
mov rax, 1 mov rax, 1
ret ret
;; Classifies a single byte as whitespace.
;; In:  dil = byte to check
;; Out: rax = 1 if the byte is space, tab, line feed or carriage return,
;;            0 for every other value
is_whitespace:
        cmp     dil, 0x20               ; ' '
        je      .yes
        cmp     dil, 0x09               ; horizontal tab
        je      .yes
        cmp     dil, 0x0A               ; line feed
        je      .yes
        cmp     dil, 0x0D               ; carriage return
        jne     .no
.yes:
        mov     eax, 1                  ; 32-bit write zero-extends into rax
        ret
.no:
        xor     eax, eax                ; rax = 0
        ret

View file

@ -73,10 +73,9 @@ compiler_entry:
;; Opens file for reading: ;; Opens file for reading:
;; rdx: pointer to filename (null-terminated) ;; rdi: pointer to filename (null-terminated)
fopen_read: fopen_read:
mov rax, 2 ; syscall: open mov rax, 2 ; syscall: open
mov rdi, rdx ; filename
mov rsi, 0 ; flags: O_RDONLY mov rsi, 0 ; flags: O_RDONLY
mov rdx, 0 ; mode mov rdx, 0 ; mode
syscall syscall
@ -86,46 +85,25 @@ fopen_read:
.file_error: .file_error:
push rdi push rdi
mov rcx, rax mov rdi, rax
call eprint_error call eprint_error
pop rdi
mov rcx, rdi ; filename is in rdi
call strlen ; get length of filename
mov r9, rax ; r9 = filename length
mov rsi, r9 lea rdi, [rel file_error_msg]
add rsi, file_error_msg_len ; + prefix lea rsi, [rel file_error_msg_len]
add rsi, 1 ; + newline
add rsi, 15
and rsi, -16 ; align up to 16
sub rsp, rsi ; allocate buffer
push rsi ; save allocation size
; copy file_error_msg
lea rcx, [rsp + 8]
mov rdx, file_error_msg
mov r8, file_error_msg_len
call memcpy
; copy filename
lea rcx, [rsp + 8 + file_error_msg_len]
mov rdx, rdi
mov r8, r9
call memcpy
; trailing newline
lea rdx, [rsp + 8 + file_error_msg_len + r9]
mov byte [rdx], 10
; print error message
lea rcx, [rsp + 8]
mov rdx, file_error_msg_len
add rdx, r9
add rdx, 1 ; include newline
call eprint_str call eprint_str
pop rsi pop rdi
add rsp, rsi ; dealloc call strlen ; get length of filename
mov rsi, rax ; rsi = filename length
call eprint_str
mov rdi, 10
push rdi
mov rdi, rsp
mov rsi, 1
call eprint_str
pop rdi
call panic call panic
;; ============================= ;; =============================
@ -212,23 +190,89 @@ LEXEMES: dq \
LEX_IF, \ LEX_IF, \
LEX_ELSE, \ LEX_ELSE, \
LEX_FN, \ LEX_FN, \
LEX_ARROW LEX_RETURN, \
LEX_LOOP, \
LEX_BREAK, \
LEX_CONTINUE, \
LEX_TRUE, \
LEX_FALSE, \
LEX_BOOL, \
LEX_ARROW, \
LEX_I32, \
LEX_U32, \
LEX_EQUALS, \
LEX_PLUS, \
LEX_MINUS, \
LEX_RPARENS, \
LEX_LPARENS, \
LEX_RBRACE, \
LEX_LBRACE, \
LEX_COLON, \
LEX_SEMI, \
LEX_COMMA, \
LEX_PIPE, \
LEX_AMP, \
LEX_EQEQ
TOKENS: db \ TOKENS: db \
TOKEN_EOF, \ TOKEN_EOF, \
TOKEN_LET, \ TOKEN_LET, \
TOKEN_IF, \ TOKEN_IF, \
TOKEN_ELSE, \ TOKEN_ELSE, \
TOKEN_FN, \ TOKEN_FN, \
TOKEN_ARROW TOKEN_RETURN, \
TOKEN_LOOP, \
TOKEN_BREAK, \
TOKEN_CONTINUE, \
TOKEN_TRUE, \
TOKEN_FALSE, \
TOKEN_BOOL, \
TOKEN_ARROW, \
TOKEN_I32, \
TOKEN_U32, \
TOKEN_EQUALS, \
TOKEN_PLUS, \
TOKEN_MINUS, \
TOKEN_RPARENS, \
TOKEN_LPARENS, \
TOKEN_RBRACE, \
TOKEN_LBRACE, \
TOKEN_COLON, \
TOKEN_SEMI, \
TOKEN_COMMA, \
TOKEN_PIPE, \
TOKEN_AMP, \
TOKEN_EQEQ
LEXEME_LENS: dq \ LEXEME_LENS: dq \
0, \ 0, \
LEX_LET_len, \ LEX_LET_len, \
LEX_IF_len, \ LEX_IF_len, \
LEX_ELSE_len, \ LEX_ELSE_len, \
LEX_FN_len, \ LEX_FN_len, \
LEX_ARROW_len LEX_RETURN_len, \
LEX_LOOP_len, \
LEX_BREAK_len, \
LEX_CONTINUE_len, \
LEX_TRUE_len, \
LEX_FALSE_len, \
LEX_BOOL_len, \
LEX_ARROW_len, \
LEX_I32_len, \
LEX_U32_len, \
LEX_EQUALS_len, \
LEX_PLUS_len, \
LEX_MINUS_len, \
LEX_RPARENS_len, \
LEX_LPARENS_len, \
LEX_RBRACE_len, \
LEX_LBRACE_len, \
LEX_COLON_len, \
LEX_SEMI_len, \
LEX_COMMA_len, \
LEX_PIPE_len, \
LEX_AMP_len, \
LEX_EQEQ_len
NUM_LEXEMES equ 5 NUM_LEXEMES equ 28
LEX_NOT_A_LEXEME db "<not a lexeme>", 0 LEX_NOT_A_LEXEME db "<not a lexeme>", 0
TOKEN_EOF equ 0 TOKEN_EOF equ 0
@ -321,6 +365,137 @@ LEXEME_LENS: dq \
LEX_NUMBER_len equ $ - LEX_NUMBER LEX_NUMBER_len equ $ - LEX_NUMBER
section .text section .text
;; Checks whether an identifier starts at the current cursor position and is
;; strictly longer than an already matched lexeme; if so, consumes it.
;; In:  rdi = length of matched lexeme
;; Out: rax = 1 and cursor advanced by the identifier length,
;;      rax = 0 and cursor untouched otherwise
;; rdi, r12, r13, r14 and rbp are restored before returning.
;; Calls is_id_start / is_id_continue with the candidate byte in dil and
;; treats a non-zero rax from them as "accepted".
is_ident:
        push    rbp
        mov     rbp, rsp
        push    r12                     ; r12 = address of the first candidate byte
        push    r13                     ; r13 = bytes remaining from the cursor
        push    r14                     ; r14 = identifier length found so far
        push    rdi                     ; [rsp] = matched lexeme length; 5 pushes keep rsp 16-byte aligned at the calls

        ; cursor is a memory variable (it is written back below), so its
        ; value has to be loaded with mov; lea would yield its address.
        mov     rax, [rel cursor]
        ; NOTE(review): buffer and buffer_len are still taken with lea, as in
        ; the original. If they are qword variables like cursor (a pointer and
        ; a length) rather than an in-place array and a constant, both need
        ; `mov reg, [rel ...]` as well - confirm against their definitions.
        lea     r12, [rel buffer]
        lea     r13, [rel buffer_len]
        add     r12, rax
        sub     r13, rax                ; remaining = buffer_len - cursor
        jbe     .not_ident              ; nothing left: do not read past the end

        ; the first byte must be a valid identifier start
        mov     dil, [r12]
        call    is_id_start
        test    rax, rax
        jz      .not_ident

        xor     r14d, r14d
.loop:
        cmp     r14, r13                ; lengths are unsigned
        jae     .done
        mov     dil, [r12 + r14]
        call    is_id_continue
        test    rax, rax
        jz      .done
        inc     r14
        jmp     .loop

.done:
        ; only report an identifier if it extends past the matched lexeme
        cmp     r14, [rsp]
        jbe     .not_ident
        add     [rel cursor], r14       ; cursor += identifier length
        mov     eax, 1
        jmp     .epilogue

.not_ident:
        xor     eax, eax
.epilogue:
        pop     rdi
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        ret
;; Number matcher placeholder: reads no input and always reports "no match".
;; Out: rax = 0
is_number:
        xor     eax, eax                ; 32-bit write clears all of rax
        ret
;; Tries to recognise one token at the current cursor position.
;; Walks LEXEMES / LEXEME_LENS / TOKENS in index order. For each lexeme that
;; fits into the remaining input it calls streq; on a match, a longer
;; identifier (is_ident) or a number (is_number) takes precedence over the
;; fixed lexeme. If no table entry matches, identifier and number are tried
;; with a matched length of 0.
;; In:  nothing in registers (reads cursor, buffer, buffer_len)
;; Out: rax = TOKENS[i] with cursor advanced by LEXEME_LENS[i],
;;            TOKEN_IDENT, TOKEN_NUMBER, or TOKEN_EOF if nothing matched
find_lexeme:
        push    rbp
        mov     rbp, rsp
        push    r12                     ; r12 = lexeme index i
        push    r13                     ; r13 = LEXEME_LENS[i]; third push leaves rsp 16-byte aligned at the calls
        xor     r12d, r12d
        ; for i in 0..NUM_LEXEMES {
        ; NOTE(review): LEXEME_LENS[0] is 0, so what iteration 0 does depends
        ; on how streq treats an empty comparison - confirm this is intended.
.loop:
        cmp     r12, NUM_LEXEMES
        jae     .not_found
        ; let len = LEXEME_LENS[i];
        ; (RIP-relative addressing cannot take an index register, so the
        ; table base is materialised first)
        lea     rax, [rel LEXEME_LENS]
        mov     r13, [rax + r12*8]
        ; let lexeme = LEXEMES[i];
        lea     rax, [rel LEXEMES]
        mov     rdi, [rax + r12*8]
        ; rsi = address of the byte at the cursor
        mov     rax, [rel cursor]
        ; NOTE(review): buffer and buffer_len are still taken with lea, as in
        ; the original. If they are qword variables like cursor, they need
        ; `mov reg, [rel ...]` instead - confirm against their definitions.
        lea     rsi, [rel buffer]
        add     rsi, rax
        mov     rdx, r13
        ; skip this lexeme unless cursor + len <= buffer_len (unsigned)
        lea     rcx, [rel buffer_len]
        sub     rcx, rax
        jb      .next
        sub     rcx, rdx
        jb      .next
        ; if buffer[cursor..cursor+len] == lexeme {
        ; NOTE(review): presumably streq takes (rdi, rsi, rdx) = (lexeme,
        ; input, len); streq is not visible here - confirm.
        call    streq
        test    rax, rax
        ; NOTE(review): kept from the original - the match body below runs
        ; only when streq returns 0. If streq returns 1 for "equal", like the
        ; is_* helpers do for "true", this must be `je .next` - confirm.
        jne     .next
        ;     if is_ident(len) {
        mov     rdi, r13                ; is_ident takes the matched length in rdi
        call    is_ident
        test    rax, rax
        ;         return TOKEN_IDENT;
        jnz     .is_ident
        ;     } else if is_number() {
        call    is_number
        test    rax, rax
        ;         return TOKEN_NUMBER;
        jnz     .is_number
        ;     } else {
        ;         cursor += len;
        add     [rel cursor], r13
        ;         return TOKENS[i];   (TOKENS is a byte table)
        lea     rax, [rel TOKENS]
        movzx   eax, byte [rax + r12]
        jmp     .epilogue
        ;     }
.next:
        inc     r12
        jmp     .loop
        ; }
.not_found:
        ; if is_ident(0) {
        xor     edi, edi                ; no lexeme matched: matched length is 0
        call    is_ident
        test    rax, rax
        ;     return TOKEN_IDENT;
        jnz     .is_ident
        ; } else if is_number() {
        call    is_number
        test    rax, rax
        ;     return TOKEN_NUMBER;
        jnz     .is_number
        ; } else {
        ;     return TOKEN_EOF;
        mov     eax, TOKEN_EOF
        ; }
.epilogue:
        pop     r13
        pop     r12
        pop     rbp
        ret
.is_ident:
        mov     rax, TOKEN_IDENT
        jmp     .epilogue
.is_number:
        mov     rax, TOKEN_NUMBER
        jmp     .epilogue
;; rcx: lexeme index ;; rcx: lexeme index
;; Returns: ;; Returns:
;; rax: token if matched, 0 if not matched ;; rax: token if matched, 0 if not matched