can tokenise stuff

This commit is contained in:
janis 2025-10-17 18:22:27 +02:00
parent 90061bf50a
commit 8a0c822533
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8
2 changed files with 353 additions and 9 deletions

View file

@ -13,6 +13,8 @@ section .text
global oom
global panic
global strlen
global strcmp
global streq
global memcpy
global eprint_str
global exit
@ -20,6 +22,11 @@ global error_to_str
global eprint_error
global alloc_pages
global is_alpha
global is_numeric
global is_id_continue
global is_id_start
;; ==============================
;; Helper functions
;; ==============================
@ -73,6 +80,96 @@ strlen:
.strlen_done:
ret
;; Checks two byte slices for equality
;; rcx: pointer to first slice
;; rdx: length of first slice
;; r8: pointer to second slice
;; r9: length of second slice
;; returns: rax = 1 if equal, 0 if not equal
;; clobbers: rax, r10, flags (rcx, rdx, r8, r9 are left untouched)
streq:
    cmp     rdx, r9
    jne     .not_equal              ; slices of different length are never equal
    xor     r10, r10                ; r10 = index of the next byte to compare
.loop:
    cmp     r10, rdx
    jae     .equal                  ; unsigned: every byte has been compared
    mov     al, [rcx + r10]         ; byte from the first slice
    cmp     al, [r8 + r10]          ; compare against memory so rbx is not clobbered
    jne     .not_equal
    inc     r10
    jmp     .loop
.equal:
    mov     rax, 1
    ret
.not_equal:
    xor     rax, rax
    ret
;; Compares two byte slices lexicographically (bytes compared as unsigned)
;; rcx: pointer to first slice
;; rdx: length of first slice
;; r8: pointer to second slice
;; r9: length of second slice
;; returns: rax = -1 if first < second, 0 if equal, 1 if first > second
;;          when one slice is a prefix of the other, the shorter one is "less"
;; clobbers: rax, r10, r11, flags (rcx, rdx, r8, r9 are left untouched)
strcmp:
    test    rdx, rdx
    jz      .check_empty_b          ; first slice is empty
    test    r9, r9
    jz      .greater                ; first is non-empty, second is empty
    mov     r10, rdx
    cmp     r9, rdx
    cmovb   r10, r9                 ; r10 = min(len first, len second)
    xor     r11, r11                ; r11 = index of the next byte to compare
.loop:
    cmp     r11, r10
    jae     .after_loop             ; unsigned: common prefix fully compared
    mov     al, [rcx + r11]         ; byte from the first slice
    cmp     al, [r8 + r11]          ; compare against memory so rbx is not clobbered
    jb      .less
    ja      .greater
    inc     r11
    jmp     .loop
.after_loop:
    ; common prefix is identical: the lengths decide the ordering
    cmp     rdx, r9
    jb      .less
    ja      .greater
.equal:
    xor     rax, rax
    ret
.less:
    mov     rax, -1
    ret
.greater:
    mov     rax, 1
    ret
.check_empty_b:
    test    r9, r9
    jz      .equal                  ; both slices are empty
    jmp     .less                   ; first is empty, second is not
;; Copy bytes from one memory location to another
;; rcx: destination pointer
;; rdx: source pointer
@ -193,3 +290,62 @@ alloc_pages:
mov rcx, rax ; error code
call eprint_error
call oom
;; Returns 1 if cl is an ASCII alphabetic character, 0 otherwise
;; cl: byte to check
;; returns: rax = 1 for 'A'..'Z' and 'a'..'z', 0 for every other byte
;; clobbers: rax, flags
is_alpha:
    cmp     cl, 'A'
    jb      .not_alpha              ; below 'A'
    cmp     cl, 'Z'
    jbe     .is_alpha_ret           ; 'A'..'Z'
    cmp     cl, 'a'
    jb      .not_alpha              ; between 'Z' and 'a'
    cmp     cl, 'z'
    ja      .not_alpha              ; above 'z' must be rejected, not fall through
.is_alpha_ret:
    mov     rax, 1
    ret
.not_alpha:
    xor     rax, rax
    ret
;; Returns 1 if cl is an ASCII decimal digit, 0 otherwise
;; cl: byte to check
;; returns: rax = 1 for '0'..'9', 0 for every other byte
;; clobbers: rax, flags
is_numeric:
    cmp     cl, '0'
    jb      .not_numeric            ; below '0'
    cmp     cl, '9'
    ja      .not_numeric            ; above '9' must be rejected, not fall through
    mov     rax, 1
    ret
.not_numeric:
    xor     rax, rax
    ret
;; Returns 1 if cl is accepted by is_alpha or is_numeric, or is '_'; 0 otherwise
;; cl: byte to check
;; returns: rax = 1 or 0
is_id_continue:
    call    is_alpha
    cmp     rax, 1
    je      .accept                 ; letter
    call    is_numeric
    cmp     rax, 1
    je      .accept                 ; digit
    cmp     cl, '_'
    jne     .reject
.accept:
    mov     rax, 1
    ret
.reject:
    xor     rax, rax
    ret
;; Returns 1 if cl is accepted by is_alpha or is '_'; 0 otherwise
;; cl: byte to check
;; returns: rax = 1 or 0
is_id_start:
    call    is_alpha
    cmp     rax, 1
    je      .accept                 ; letter
    cmp     cl, '_'
    jne     .reject
.accept:
    mov     rax, 1
    ret
.reject:
    xor     rax, rax
    ret

View file

@ -4,6 +4,8 @@ extern int_to_str
extern oom
extern panic
extern strlen
extern strcmp
extern streq
extern memcpy
extern eprint_str
extern exit
@ -11,6 +13,11 @@ extern error_to_str
extern eprint_error
extern alloc_pages
extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start
section .data
hello_msg db "Hello, World!", 10
hello_msg_len equ $ - hello_msg
@ -19,14 +26,6 @@ section .data
section .text
global _start
;; Pushes r11 and r10, pops them back in reverse order, and returns.
;; Both registers hold their original values again on return; the routine
;; takes no arguments and produces no result.
test_fn:
push r11
push r10
; restore in the opposite order of the pushes
pop r10
pop r11
ret
_start:
; get filename from argv[1]
; argv is at rsp + 8
@ -44,7 +43,14 @@ _start:
mov rdx, rax ; rdx = pointer to filename
call tokeniser_init
call tokeniser_print
jmp .exit
.loop:
call tokeniser_next_token
cmp rax, 0
je .exit
mov rcx, [LEXEMES + rax*8]
mov rdx, [LEXEME_LENS + rax*8]
call eprint_str
jmp .loop
.no_filename:
call panic
@ -186,3 +192,185 @@ tokeniser_print:
mov rcx, [buffer + rax]
mov rdx, [buffer_len]
call eprint_str
section .rdata
;; LEXEMES, TOKENS and LEXEME_LENS are parallel tables sharing one index.
;; Slot 0 is a placeholder entry; slots 1..NUM_LEXEMES describe real lexemes.

;; Pointer to the text of each lexeme (8 bytes per slot)
LEXEMES:
    dq LEX_NOT_A_LEXEME
    dq LEX_LET
    dq LEX_IF
    dq LEX_ELSE
    dq LEX_FN
    dq LEX_ARROW

;; Token value of each lexeme (1 byte per slot)
TOKENS:
    db TOKEN_EOF
    db TOKEN_LET
    db TOKEN_IF
    db TOKEN_ELSE
    db TOKEN_FN
    db TOKEN_ARROW

;; Length in bytes of each lexeme (8 bytes per slot)
LEXEME_LENS:
    dq 0
    dq LEX_LET_len
    dq LEX_IF_len
    dq LEX_ELSE_len
    dq LEX_FN_len
    dq LEX_ARROW_len

;; Number of real lexemes, i.e. table slots not counting placeholder slot 0
NUM_LEXEMES         equ 5

LEX_NOT_A_LEXEME    db "<not a lexeme>", 0

;; Token values; a token with a fixed spelling is followed by its lexeme
;; text and a *_len constant measured directly after that text.
TOKEN_EOF           equ 0

TOKEN_LET           equ 1
LEX_LET             db "let"
LEX_LET_len         equ $ - LEX_LET

TOKEN_IF            equ 2
LEX_IF              db "if"
LEX_IF_len          equ $ - LEX_IF

TOKEN_ELSE          equ 3
LEX_ELSE            db "else"
LEX_ELSE_len        equ $ - LEX_ELSE

TOKEN_FN            equ 4
LEX_FN              db "fn"
LEX_FN_len          equ $ - LEX_FN

TOKEN_RETURN        equ 5
TOKEN_LOOP          equ 6
TOKEN_BREAK         equ 7
TOKEN_CONTINUE      equ 8
TOKEN_TRUE          equ 9
TOKEN_FALSE         equ 10
TOKEN_BOOL          equ 11

TOKEN_ARROW         equ 12
LEX_ARROW           db "->"
LEX_ARROW_len       equ $ - LEX_ARROW

TOKEN_I32           equ 13
TOKEN_U32           equ 14
TOKEN_EQUALS        equ 15
TOKEN_PLUS          equ 16
TOKEN_MINUS         equ 17
TOKEN_RPARENS       equ 18
TOKEN_LPARENS       equ 19
TOKEN_RBRACE        equ 20
TOKEN_LBRACE        equ 21
TOKEN_COLON         equ 22
TOKEN_SEMI          equ 23
TOKEN_COMMA         equ 24
TOKEN_PIPE          equ 25
TOKEN_AMP           equ 26
TOKEN_IDENT         equ 27
TOKEN_NUMBER        equ 28
section .text
;; Tries to match one fixed lexeme against the input at the current cursor.
;; rcx: pointer to lexeme
;; rdx: length of lexeme
;; r8: token to return if matched
;; Returns:
;; rax: token if matched, 0 if not matched.
;;      If the lexeme matches but is only the beginning of a longer
;;      identifier, the whole identifier is consumed and TOKEN_IDENT is returned.
;; Side effects: on any non-zero result [cursor] is advanced past the
;;      consumed bytes; on 0 the cursor is unchanged.
;; Preserves r8 and r15; clobbers rax, rcx, r9 and whatever streq clobbers.
;; Expects [cursor] <= [buffer_len] and [buffer] to hold the buffer pointer.
try_lexeme:
    push    r8                      ; keep the token; r8 is reused as a pointer
    push    r15
    ; compare lexeme with the bytes at the cursor
    mov     r9, [cursor]
    mov     r8, [buffer]
    add     r8, r9                  ; r8 = address of the byte at the cursor
    mov     rax, [buffer_len]
    sub     rax, r9                 ; rax = bytes left in the buffer
    cmp     rax, rdx
    jl      .not_equal              ; signed on purpose: also rejects a cursor past the end
    ; compare memory
    mov     r9, rdx
    call    streq
    cmp     rax, 1
    jne     .not_equal
    ; check if it could be an ident:
    ;   the buffer must not have ended
    ;   the next char must be id_continue
    ;   first char must be id_start
    mov     r9, [buffer]            ; r9 = buffer base; `buffer` holds a pointer, not the bytes
    mov     rax, [cursor]
    add     rax, rdx                ; rax = offset of the byte right after the lexeme
    cmp     rax, [buffer_len]
    jae     .not_ident
    mov     cl, [r9 + rax]
    call    is_id_continue
    cmp     rax, 1
    jne     .not_ident
    ; check first char
    mov     rax, [cursor]
    mov     cl, [r9 + rax]
    call    is_id_start
    cmp     rax, 1
    jne     .not_ident
    ; this is an ident
    ; move cursor forward while is_id_continue
    mov     r15, [cursor]
    add     r15, rdx                ; r15 = offset of the first byte after the lexeme
.try_lexeme_loop:
    cmp     r15, [buffer_len]
    jae     .done_ident
    mov     cl, [r9 + r15]
    call    is_id_continue
    cmp     rax, 1
    jne     .done_ident
    inc     r15
    jmp     .try_lexeme_loop
.done_ident:
    mov     [cursor], r15           ; cursor now sits just past the identifier
    pop     r15
    pop     r8
    mov     rax, TOKEN_IDENT
    ret
.not_ident:
    ; plain lexeme match: consume exactly the lexeme
    mov     rax, [cursor]
    add     rax, rdx
    mov     [cursor], rax
    pop     r15
    pop     r8                      ; r8 = token passed in by the caller
    mov     rax, r8
    ret
.not_equal:
    pop     r15
    pop     r8
    xor     rax, rax
    ret
;; Scans forward from the cursor until one of the table lexemes matches.
;; Bytes at which no lexeme matches (including spaces) are skipped one at a time.
;; Returns:
;; rax: index of the matched lexeme in LEXEMES / TOKENS / LEXEME_LENS,
;;      or TOKEN_EOF (0) once the cursor reaches the end of the buffer.
;;      NOTE: this is the table index, not the TOKEN_* value; the two differ
;;      for "->" (index 5, TOKEN_ARROW = 12), and a keyword-prefixed identifier
;;      is reported with the index of the keyword that prefixed it.
;; Side effects: advances [cursor].
;; Preserves r15; clobbers rax, rcx, rdx, r8 and whatever try_lexeme clobbers.
tokeniser_next_token:
    push    r15                     ; r15 carries the lexeme index across try_lexeme
.loop:
    ; check if at end of buffer
    mov     rax, [cursor]
    cmp     rax, [buffer_len]
    jae     .eof
    ; look at the single byte under the cursor
    mov     rcx, [buffer]
    ; skip whitespace
    cmp     byte [rcx + rax], ' '
    je      .skip
    mov     r15, 1                  ; lexeme index; slot 0 is a placeholder
.inner_loop:
    ; valid lexeme slots are 1..NUM_LEXEMES inclusive
    cmp     r15, NUM_LEXEMES
    ; TODO: numbers, idents
    ja      .skip
    ; try lexeme
    mov     rcx, [LEXEMES + r15*8]
    mov     rdx, [LEXEME_LENS + r15*8]
    movzx   r8, byte [TOKENS + r15] ; TOKENS holds one byte per slot
    call    try_lexeme
    test    rax, rax
    jnz     .return_token
    inc     r15
    jmp     .inner_loop
.skip:
    inc     qword [cursor]          ; nothing matched here: move on by one byte
    jmp     .loop
.eof:
    mov     rax, TOKEN_EOF
    pop     r15
    ret
.return_token:
    mov     rax, r15                ; report which table slot matched
    pop     r15
    ret