; from-scratch/lang/src/tokeniser.asm
;
; 597 lines
; 11 KiB
; NASM

section .text
extern panic
extern strlen
extern strcmp
extern streq
extern memcpy
extern eprint_str
extern exit
extern error_to_str
extern eprint_error
extern alloc_pages
extern allocate
extern fopen_read
extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start
extern is_whitespace
global tokeniser_init
global tokeniser_init_buf
global tokeniser_print
global find_lexeme
global expect_token
global unwrap_token
global peek_expect_token
global peek_lexeme
global tokeniser_get_cursor
global tokeniser_set_cursor
;; =============================
;; Tokeniser functions
;; =============================
;; tokeniser state
section .data
;; Global tokeniser state, shared by every routine in this file and exported
;; so other modules can inspect it.
global input_file
global buffer
global cursor
global buffer_len
;; File descriptor returned by fopen_read in tokeniser_init (32-bit);
;; reset to 0 by tokeniser_init_buf.
input_file dd 0
;; Pointer to the first byte of the text being tokenised.
buffer dq 0
;; Byte offset into `buffer` of the next unread character.
cursor dq 0
;; Number of bytes available in `buffer`.
buffer_len dq 0
;; each buffer is chunk_size bytes large
;; buffer header structure:
;; +0 (8 bytes): pointer buffer
;; +8 (8 bytes): size of buffer
;; Tokens:
;; [let, if, else, fn, return, loop, break, continue, true, false, i32, u32, bool, =, +, -, *, /, %, ==, !=, <, <=, >, >=, &&, ||, !, (, ), {, }, [, ], ;, ',', ]
section .bss
;; Scratch area that receives the result of the fstat syscall issued by
;; tokeniser_init; the file size is read back from offset 48 of this area.
;; NOTE(review): 144 is presumably sizeof(struct stat) on x86-64 Linux - confirm.
statbuf: resb 144
section .text
;; Points the tokeniser at an already-loaded, in-memory buffer.
;; rdi: pointer to buffer
;; rsi: length of buffer
;; Side effects: buffer/buffer_len are replaced, the cursor is rewound to 0
;; and input_file is cleared (no file is associated with the buffer).
tokeniser_init_buf:
    push    rbp
    mov     rbp, rsp
    mov     qword [rel buffer], rdi
    mov     qword [rel buffer_len], rsi
    mov     qword [rel cursor], 0
    mov     dword [rel input_file], 0
    leave
    ret
;; Initialises the tokeniser from a file: opens it, queries its size with
;; fstat, allocates a buffer of that size and reads the file into it.
;;
;; In:  pointer to a null-terminated filename. The argument registers are
;;      forwarded untouched to fopen_read, so the filename must be in whichever
;;      register fopen_read expects.
;;      NOTE(review): the original header named rdx - confirm against fopen_read.
;; Out: input_file, buffer, buffer_len and cursor are set; cursor starts at 0.
;; On a failing fstat/read the error is reported via eprint_error, then panic.
tokeniser_init:
    push    rbp
    mov     rbp, rsp
    push    r15                         ; callee-saved: holds st_size across `allocate`
    sub     rsp, 8                      ; keep rsp 16-byte aligned at the calls below

    ; open file for reading
    ; (per the original comment, fopen_read panics if the file doesn't exist)
    call    fopen_read
    mov     dword [rel input_file], eax ; store file descriptor
    mov     qword [rel cursor], 0
    mov     qword [rel buffer_len], 0

    ; fstat(fd, &statbuf)
    mov     rax, 5                      ; syscall: fstat
    mov     edi, [rel input_file]       ; fd - dword load: input_file is only 4 bytes wide
    lea     rsi, [rel statbuf]          ; statbuf
    syscall
    test    rax, rax
    js      .report_error               ; negative return = -errno

    ; get file size from statbuf
    mov     r15, [rel statbuf + 48]     ; offset of st_size in stat struct

    ; allocate buffer
    mov     rdi, r15
    call    allocate
    mov     qword [rel buffer], rax
    mov     qword [rel buffer_len], r15

    ; read(fd, buffer, buffer_len)
    mov     rax, 0                      ; syscall: read
    mov     edi, [rel input_file]       ; fd
    mov     rsi, [rel buffer]           ; buf
    mov     rdx, [rel buffer_len]       ; count
    syscall
    test    rax, rax
    js      .report_error
    ; Only the bytes actually read are valid; never tokenise past them.
    mov     qword [rel buffer_len], rax

    add     rsp, 8
    pop     r15
    pop     rbp
    ret

.report_error:
    ; rax = negative errno from the failing syscall.
    ; NOTE(review): the code is handed over in rcx exactly as before; confirm
    ; that this is the register eprint_error reads its argument from.
    mov     rcx, rax
    call    eprint_error
    call    panic                       ; not expected to return
section .rdata
tokeniser_buffer db "Tokeniser buffer: ", 10
tokeniser_buffer_len equ $ - tokeniser_buffer
section .text
;; Debug helper: writes a heading followed by the not-yet-consumed part of the
;; buffer (buffer[cursor .. buffer_len]) through eprint_str.
;; Takes no arguments. eprint_str is called with rdi = pointer, rsi = length.
tokeniser_print:
    push    rbp
    mov     rbp, rsp

    lea     rdi, [rel tokeniser_buffer]
    mov     rsi, tokeniser_buffer_len
    call    eprint_str

    mov     rax, [rel cursor]
    mov     rdi, [rel buffer]
    add     rdi, rax                    ; rdi = &buffer[cursor]
    mov     rsi, [rel buffer_len]
    sub     rsi, rax                    ; rsi = bytes left after the cursor
    jbe     .done                       ; cursor >= buffer_len: nothing left to print
    call    eprint_str
.done:
    pop     rbp
    ret
section .rdata
global LEXEMES
global TOKENS
global LEXEME_LENS
global NUM_LEXEMES
%include "src/tokeniser.inc"
section .text
;; fn is_ident(lexeme_len: usize) -> usize
;; Tries to read an identifier at the cursor.
;; rdi: length of a keyword/operator lexeme already matched at the cursor
;;      (0 when nothing matched)
;; Returns (rax) the identifier length and advances the cursor past it, but
;; only when the identifier is strictly longer than `lexeme_len`. Otherwise
;; returns 0 and leaves the cursor alone.
;; rdi holds its entry value again on return.
is_ident:
    push    rbp
    mov     rbp, rsp
    push    rbx
    push    r12
    push    r13
    push    r14

    mov     rbx, rdi                    ; rbx = lexeme_len
    mov     rax, [rel cursor]
    mov     r12, [rel buffer]
    add     r12, rax                    ; r12 = &buffer[cursor]
    mov     r13, [rel buffer_len]
    sub     r13, rax                    ; r13 = bytes left in the buffer

    ; the first character has to be able to start an identifier
    mov     dil, [r12]
    call    is_id_start
    test    rax, rax
    jz      .reject

    mov     r14, 1                      ; r14 = identifier length so far
    jmp     .check
.advance:
    inc     r14
.check:
    cmp     r14, r13
    jge     .measured                   ; ran into the end of the buffer
    mov     dil, [r12 + r14]
    call    is_id_continue
    test    rax, rax
    jnz     .advance

.measured:
    ; an identifier no longer than the matched lexeme is that lexeme itself
    cmp     r14, rbx
    jle     .reject
    add     qword [rel cursor], r14
    mov     rax, r14
    jmp     .return

.reject:
    xor     eax, eax
.return:
    mov     rdi, rbx                    ; hand rdi back as it came in
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    pop     rbp
    ret
;; Recognises a `//` line comment at the cursor.
;; Returns (rax) the comment length, counted from the first '/' up to but not
;; including the terminating newline (or up to the end of the buffer), and
;; advances the cursor by that amount. Returns 0, cursor untouched, when the
;; text at the cursor does not start with "//".
;; Leaf routine: works in scratch registers only (rax, rcx, rdx, rsi).
is_comment:
    push    rbp
    mov     rbp, rsp

    mov     rdx, [rel cursor]
    mov     rsi, [rel buffer]
    add     rsi, rdx                    ; rsi = &buffer[cursor]
    mov     rcx, [rel buffer_len]
    sub     rcx, rdx                    ; rcx = bytes left in the buffer

    xor     eax, eax                    ; result stays 0 unless "//" is found
    cmp     byte [rsi], '/'
    jne     .return
    cmp     rcx, 1
    jle     .return                     ; no room for a second character
    cmp     byte [rsi + 1], '/'
    jne     .return

    mov     eax, 1                      ; rax = index of the last byte examined
.scan:
    inc     rax
    cmp     rax, rcx
    jge     .consume                    ; comment runs to the end of the buffer
    cmp     byte [rsi + rax], 10        ; newline
    jne     .scan

.consume:
    add     qword [rel cursor], rax
.return:
    pop     rbp
    ret
;; Strings are sequences of characters enclosed in double quotes.
;; Strings span multiple lines. A backslash makes the scanner skip the byte
;; that follows it, so an escaped quote does not end the string; escape
;; sequences are not otherwise interpreted here.
;;
;; Returns (rax) the length of the string including both quotes and advances
;; the cursor past it. Returns 0, cursor untouched, when the character at the
;; cursor is not a double quote.
;; An unterminated string is consumed up to the end of the buffer: the scanned
;; length is returned AND the cursor is advanced by it, so the returned length
;; and the cursor always agree.
;; Leaf routine: works in scratch registers only (rax, rcx, rdx, rsi).
is_string:
    push    rbp
    mov     rbp, rsp

    mov     rdx, [rel cursor]
    mov     rsi, [rel buffer]
    add     rsi, rdx                    ; rsi = &buffer[cursor]
    mov     rcx, [rel buffer_len]
    sub     rcx, rdx                    ; rcx = bytes left in the buffer

    xor     eax, eax                    ; result stays 0 if there is no opening quote
    cmp     byte [rsi], '"'
    jne     .return

    mov     eax, 1                      ; rax = bytes scanned so far (opening quote)
.scan:
    cmp     rax, rcx
    jge     .unterminated
    mov     dl, [rsi + rax]
    cmp     dl, '"'
    je      .closed
    cmp     dl, 0x5c                    ; backslash
    je      .escape
    inc     rax
    jmp     .scan

.escape:
    inc     rax                         ; step over the backslash
    cmp     rax, rcx
    jge     .unterminated
    inc     rax                         ; step over the escaped byte
    jmp     .scan

.closed:
    inc     rax                         ; include closing quote
.unterminated:
    ;; TODO: report unterminated string error
    add     qword [rel cursor], rax     ; consume what was scanned in either case
.return:
    pop     rbp
    ret
;; Numbers are runs of numeric characters that may contain underscores.
;; The leading character must be numeric (an underscore cannot start a number).
;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
;;
;; Returns (rax) the length of the number and advances the cursor past it, or
;; returns 0 with the cursor untouched when no number starts at the cursor.
is_number:
    push    rbp
    mov     rbp, rsp
    push    r12
    push    r13
    push    r14

    mov     rax, [rel cursor]
    mov     r13, [rel buffer_len]
    sub     r13, rax                    ; r13 = bytes left in the buffer
    mov     r12, [rel buffer]
    add     r12, rax                    ; r12 = &buffer[cursor]

    mov     dil, [r12]
    call    is_numeric
    test    rax, rax
    jz      .reject

    mov     r14, 1                      ; r14 = length accepted so far
    jmp     .check
.accept_char:
    inc     r14
.check:
    cmp     r14, r13
    jge     .finish                     ; ran into the end of the buffer
    mov     dil, [r12 + r14]
    cmp     dil, '_'
    je      .accept_char                ; underscores are digit separators
    call    is_numeric
    test    rax, rax
    jnz     .accept_char

.finish:
    add     qword [rel cursor], r14
    mov     rax, r14
    jmp     .return

.reject:
    xor     eax, eax
.return:
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    ret
;; Moves the cursor forward over every byte that is_whitespace accepts,
;; stopping at the first other byte or at the end of the buffer.
;; Takes no arguments; the only result is the updated cursor.
;;
;;   while cursor < buffer_len && is_whitespace(buffer[cursor]) { cursor += 1 }
skip_whitespaces:
    push    rbp
    mov     rbp, rsp
    push    r12
    push    r13
    push    r14

    mov     r14, [rel buffer]           ; r14 = base of the text
    mov     r13, [rel buffer_len]       ; r13 = one past the last valid index
    mov     r12, [rel cursor]           ; r12 = index being examined
    jmp     .check
.consume:
    inc     r12
.check:
    cmp     r12, r13
    jge     .store
    mov     dil, [r14 + r12]
    call    is_whitespace
    test    rax, rax
    jnz     .consume

.store:
    mov     [rel cursor], r12
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    ret
;; fn find_lexeme() -> (u8, *const u8, usize)
;; Skips whitespace, then reads the next token at the cursor.
;;
;; rdi: pointer to a 24-byte out-struct, filled as
;;        +0  (8 bytes): token kind (TOKEN_* / TOKENS[i])
;;        +8  (8 bytes): pointer to the first byte of the lexeme
;;        +16 (8 bytes): length of the lexeme in bytes
;; The cursor is advanced past the token that was recognised.
;;
;; Recognition order: string, number, `//` comment, then the LEXEMES table
;; (entries 1..NUM_LEXEMES). A table hit is reported as TOKEN_IDENT instead
;; when a longer identifier starts at the same position. If nothing matches,
;; an identifier is tried; failing that the kind is TOKEN_EOF and the cursor
;; is not moved.
;;
;; Returns: rax = out-struct pointer, except at end of input where
;;          rax = TOKEN_EOF (kept as-is for existing callers).
;;          rdi = out-struct pointer on every path.
find_lexeme:
    push    rbp
    mov     rbp, rsp
    push    rbx
    push    r12
    mov     rbx, rdi                    ; rbx = out-struct, live across all calls

    call    skip_whitespaces

    ; out = (0, &buffer[cursor], 0)
    mov     rax, [rel buffer]
    add     rax, [rel cursor]
    mov     qword [rbx], 0
    mov     [rbx + 8], rax
    mov     qword [rbx + 16], 0

    ; if cursor >= buffer_len { return TOKEN_EOF; }
    mov     rax, [rel cursor]
    cmp     rax, [rel buffer_len]
    jge     .eof

    ; special tokens first: each helper returns the consumed length or 0
    call    is_string
    test    rax, rax
    jnz     .is_string
    call    is_number
    test    rax, rax
    jnz     .is_number
    call    is_comment
    test    rax, rax
    jnz     .is_comment

    ; for i in 1..NUM_LEXEMES {
    mov     r12, 1                      ; r12 = i
.loop:
    cmp     r12, [rel NUM_LEXEMES]
    jge     .not_found
    ;   streq(LEXEMES[i], LEXEME_LENS[i], &buffer[cursor], LEXEME_LENS[i])
    lea     rdi, [rel LEXEMES]
    mov     rdi, [rdi + r12*8]
    lea     rdx, [rel LEXEME_LENS]
    mov     rsi, [rdx + r12*8]
    mov     rax, [rel cursor]
    mov     rdx, [rel buffer]
    add     rdx, rax
    mov     rcx, [rel buffer_len]
    sub     rcx, rax                    ; rcx = bytes left in the buffer
    jo      .not_found
    ;   if lexeme.len() > buffer.len() - cursor { continue; }
    cmp     rsi, rcx
    jg      .next
    mov     rcx, rsi
    call    streq
    test    rax, rax
    jz      .next

    ;   A longer identifier at this position wins over the table entry.
    ;   The length is reloaded from the table: rsi is caller-saved and must
    ;   not be assumed to survive the call to streq.
    lea     rdi, [rel LEXEME_LENS]
    mov     rdi, [rdi + r12*8]
    call    is_ident
    test    rax, rax
    jnz     .is_ident

    ;   cursor += len; return TOKENS[i];
    lea     rsi, [rel LEXEME_LENS]
    mov     rsi, [rsi + r12*8]
    add     qword [rel cursor], rsi
    lea     rax, [rel TOKENS]
    movzx   eax, byte [rax + r12]
    mov     [rbx], al                   ; upper bytes of the kind were zeroed above
    mov     [rbx + 16], rsi
    jmp     .epilogue

.next:
    inc     r12
    jmp     .loop
    ; }

.not_found:
    ; no table entry matched: a plain identifier, or nothing recognisable
    xor     edi, edi
    call    is_ident
    test    rax, rax
    jnz     .is_ident
    mov     qword [rbx], TOKEN_EOF
    jmp     .epilogue

    ; rax = consumed length for all four kinds below
.is_ident:
    mov     qword [rbx], TOKEN_IDENT
    jmp     .store_len
.is_number:
    mov     qword [rbx], TOKEN_NUMBER
    jmp     .store_len
.is_string:
    mov     qword [rbx], TOKEN_STRING
    jmp     .store_len
.is_comment:
    mov     qword [rbx], TOKEN_COMMENT
.store_len:
    mov     [rbx + 16], rax

.epilogue:
    mov     rax, rbx
    mov     rdi, rbx
    pop     r12
    pop     rbx
    pop     rbp
    ret

.eof:
    ; Store the kind explicitly instead of relying on the zero written during
    ; initialisation happening to equal TOKEN_EOF.
    mov     qword [rbx], TOKEN_EOF
    mov     rdi, rbx
    mov     rax, TOKEN_EOF
    pop     r12
    pop     rbx
    pop     rbp
    ret
;; Consumes the next token only if it is of the expected kind.
;; dil: expected token
;; Returns: on a match, rax = lexeme pointer and rdx = lexeme length, with the
;;          cursor left after the token;
;;          otherwise rax = 0 and rdx = 0, with the cursor put back where it
;;          was on entry.
;; Only the low byte of the token kind reported by find_lexeme is compared.
;;
;; Frame layout (rbp-relative):
;;   -0x30 (1 byte):   expected token
;;   -0x28 (8 bytes):  cursor on entry
;;   -0x20 (24 bytes): find_lexeme out-struct (kind, pointer, length)
expect_token:
    push    rbp
    mov     rbp, rsp
    sub     rsp, 0x30

    mov     [rbp - 0x30], dil
    mov     rax, [rel cursor]
    mov     [rbp - 0x28], rax
    lea     rdi, [rbp - 0x20]
    call    find_lexeme

    mov     al, [rbp - 0x20]            ; kind that was found
    cmp     al, [rbp - 0x30]
    jne     .mismatch

    mov     rax, [rbp - 0x18]           ; lexeme pointer
    mov     rdx, [rbp - 0x10]           ; lexeme length
    leave
    ret

.mismatch:
    mov     rdi, [rbp - 0x28]
    mov     [rel cursor], rdi           ; un-read the token
    xor     eax, eax
    xor     edx, edx
    leave
    ret
;; Same as expect_token, but a missing token is fatal.
;; dil: expected token
;; Returns whatever expect_token returned (rax = lexeme pointer, rdx = length)
;; when rax is non-zero; calls panic when expect_token returned rax = 0.
unwrap_token:
    push    rbp
    mov     rbp, rsp
    call    expect_token
    test    rax, rax
    je      .missing
    leave
    ret
.missing:
    call    panic
;; Non-consuming variant of expect_token: the cursor is always put back to its
;; entry value, whether or not the token matched.
;; dil: expected token
;; Returns expect_token's result unchanged: rax = 0 if the token was not found,
;; else the lexeme as (rax = pointer, rdx = length).
peek_expect_token:
    push    rbp
    mov     rbp, rsp
    sub     rsp, 16
    mov     rax, [rel cursor]
    mov     [rbp - 8], rax              ; remember where we started
    call    expect_token
    mov     rdi, [rbp - 8]
    mov     [rel cursor], rdi           ; rewind; rax/rdx are left untouched
    leave
    ret
;; Non-consuming variant of find_lexeme: fills the out-struct with the next
;; token, then puts the cursor back to its entry value.
;; rdi: out-struct pointer
;; Returns: rax = the out-struct pointer that was passed in.
peek_lexeme:
    push    rbp
    mov     rbp, rsp
    sub     rsp, 16
    mov     [rbp - 8], rdi              ; out-struct pointer, returned at the end
    mov     rax, [rel cursor]
    mov     [rbp - 16], rax             ; cursor on entry
    call    find_lexeme
    mov     rdi, [rbp - 16]
    mov     [rel cursor], rdi           ; restore cursor
    mov     rax, [rbp - 8]
    leave
    ret
;; Returns (rax) the current cursor, i.e. the byte offset into the buffer.
;; The value can later be handed to tokeniser_set_cursor.
tokeniser_get_cursor:
    mov     rax, qword [rel cursor]
    ret
;; Overwrites the cursor with the byte offset given in rdi.
;; The offset is stored as-is; no bounds check against buffer_len is made.
tokeniser_set_cursor:
    mov     qword [rel cursor], rdi
    ret