597 lines
11 KiB
NASM
597 lines
11 KiB
NASM
section .text
|
|
extern panic
|
|
extern strlen
|
|
extern strcmp
|
|
extern streq
|
|
extern memcpy
|
|
extern eprint_str
|
|
extern exit
|
|
extern error_to_str
|
|
extern eprint_error
|
|
extern alloc_pages
|
|
extern allocate
|
|
extern fopen_read
|
|
|
|
extern is_alpha
|
|
extern is_numeric
|
|
extern is_id_continue
|
|
extern is_id_start
|
|
extern is_whitespace
|
|
|
|
global tokeniser_init
|
|
global tokeniser_init_buf
|
|
global tokeniser_print
|
|
global find_lexeme
|
|
global expect_token
|
|
global unwrap_token
|
|
global peek_expect_token
|
|
global peek_lexeme
|
|
|
|
global tokeniser_get_cursor
|
|
global tokeniser_set_cursor
|
|
|
|
;; =============================
|
|
;; Tokeniser functions
|
|
;; =============================
|
|
|
|
;; Tokeniser state.
;; All four symbols are exported so that other modules can reach them directly.
;; The order and sizes below are part of the layout: input_file is only a
;; dword and is immediately followed by buffer.
section .data
global input_file
global buffer
global cursor
global buffer_len

input_file:     dd 0                    ; fd stored by tokeniser_init; 0 after tokeniser_init_buf
buffer:         dq 0                    ; pointer to the first byte of the text being tokenised
cursor:         dq 0                    ; byte offset into buffer of the next unread character
buffer_len:     dq 0                    ; number of bytes in buffer
|
|
|
|
;; each buffer is chunk_size bytes large
|
|
;; buffer header structure:
|
|
;; +0 (8 bytes): pointer buffer
|
|
;; +8 (8 bytes): size of buffer
|
|
|
|
;; Tokens:
|
|
;; [let, if, else, fn, return, loop, break, continue, true, false, i32, u32, bool, =, +, -, *, /, %, ==, !=, <, <=, >, >=, &&, ||, !, (, ), {, }, [, ], ;, ',', ]
|
|
|
|
section .bss
;; Scratch area that tokeniser_init hands to the fstat syscall.
;; 144 bytes is the size of struct stat on x86-64 Linux.
TOK_STAT_STRUCT_SIZE    equ 144
statbuf:        resb TOK_STAT_STRUCT_SIZE
|
|
|
|
section .text
|
|
|
|
;; Points the tokeniser at an in-memory buffer and rewinds it.
;;
;; In:    rdi = pointer to buffer
;;        rsi = length of buffer in bytes
;; Out:   nothing; no register is modified
;; State: buffer = rdi, buffer_len = rsi, cursor = 0, input_file = 0
tokeniser_init_buf:
    ; Leaf routine without locals, so no stack frame is needed.
    mov     qword [rel buffer], rdi
    mov     qword [rel buffer_len], rsi
    mov     qword [rel cursor], 0
    mov     dword [rel input_file], 0   ; input_file is a dword, not a qword
    ret
|
|
|
|
|
|
;; Initialises the tokeniser from a file: opens it, sizes it with fstat,
;; allocates a buffer of exactly that size and reads the file into it.
;;
;; In:    the filename argument is forwarded untouched to fopen_read, in
;;        whatever register the caller placed it.
;;        NOTE(review): this header used to say "rdx: pointer to filename
;;        (null-terminated)" -- confirm against fopen_read's convention.
;; Out:   rax = value returned by the read syscall (bytes read)
;; State: input_file = fd, buffer = allocation, buffer_len = st_size, cursor = 0
;; Fails: calls eprint_error and then panic when fstat or read return a
;;        negative value.
;; NOTE(review): a short read is not detected; buffer_len stays at st_size.
TOK_SYS_READ            equ 0           ; Linux x86-64 syscall numbers
TOK_SYS_FSTAT           equ 5
TOK_ST_SIZE_OFFSET      equ 48          ; offset of st_size in struct stat (x86-64 Linux)

tokeniser_init:
    push    rbp
    mov     rbp, rsp                    ; frame also keeps rsp 16-byte aligned at the calls

    ; open file for reading
    ; this panics if the file doesn't exist
    call    fopen_read
    mov     dword [rel input_file], eax ; store file descriptor
    mov     qword [rel cursor], 0
    mov     qword [rel buffer_len], 0

    ; fstat(fd, &statbuf)
    mov     eax, TOK_SYS_FSTAT
    mov     edi, [rel input_file]       ; dword load: input_file is only 4 bytes wide
    lea     rsi, [rel statbuf]
    syscall
    test    rax, rax
    js      .report_error               ; negative return = -errno

    ; allocate st_size bytes; the size is re-read from statbuf after the call
    ; instead of being parked in a callee-saved register
    mov     rdi, [rel statbuf + TOK_ST_SIZE_OFFSET]
    call    allocate
    mov     qword [rel buffer], rax
    mov     rax, [rel statbuf + TOK_ST_SIZE_OFFSET]
    mov     qword [rel buffer_len], rax

    ; read(fd, buffer, buffer_len)
    mov     edi, [rel input_file]
    mov     rsi, [rel buffer]
    mov     rdx, [rel buffer_len]
    mov     eax, TOK_SYS_READ
    syscall
    test    rax, rax
    js      .report_error

    pop     rbp
    ret

.report_error:
    ; Every other call in this file passes its first argument in rdi, while
    ; the original code passed the error in rcx only. Both are set so that
    ; eprint_error sees the error code under either convention.
    ; NOTE(review): confirm which register eprint_error actually reads.
    mov     rdi, rax
    mov     rcx, rax
    call    eprint_error
    call    panic                       ; not expected to return
|
|
|
|
section .rdata
tokeniser_buffer        db "Tokeniser buffer: ", 10
tokeniser_buffer_len    equ $ - tokeniser_buffer

section .text
;; Debug helper: passes a fixed heading and then the not-yet-consumed part of
;; the input (buffer[cursor .. buffer_len]) to eprint_str.
;;
;; In:    nothing
;; Out:   nothing defined (rax holds whatever the last eprint_str left there)
;; Calls: eprint_str(rdi = pointer, rsi = length)
tokeniser_print:
    push    rbp
    mov     rbp, rsp                    ; keeps rsp 16-byte aligned at the calls

    lea     rdi, [rel tokeniser_buffer]
    mov     rsi, tokeniser_buffer_len
    call    eprint_str

    mov     rax, [rel cursor]
    mov     rdi, [rel buffer]
    add     rdi, rax                    ; rdi = &buffer[cursor]
    mov     rsi, [rel buffer_len]
    sub     rsi, rax                    ; only the bytes left after the cursor; passing
                                        ; buffer_len here would over-read by cursor bytes
    call    eprint_str

    pop     rbp
    ret
|
|
|
|
section .rdata
|
|
global LEXEMES
|
|
global TOKENS
|
|
global LEXEME_LENS
|
|
global NUM_LEXEMES
|
|
|
|
%include "src/tokeniser.inc"
|
|
|
|
section .text
|
|
;; fn is_ident(lexeme_len: usize) -> usize
;;
;; Tries to read an identifier at the cursor. The identifier is accepted only
;; when it is strictly longer than lexeme_len, i.e. longer than a
;; keyword/operator that was already matched at the same position.
;;
;; In:    rdi = length of the previously matched lexeme (0 if none)
;; Out:   rax = length of the identifier, and cursor advanced past it;
;;        rax = 0 and cursor untouched when there is no (longer) identifier
;; Keeps: rdi, r12, r13, r14 (saved and restored)
;; Calls: is_id_start / is_id_continue with the character in dil; a non-zero
;;        rax from them is treated as "yes"
;; Note:  the first byte at the cursor is read without a bounds check.
is_ident:
    push    rbp
    mov     rbp, rsp
    push    r12
    push    r13
    push    r14
    push    rdi                         ; [rsp] = lexeme_len, reloaded after the scan

    mov     rax, [rel cursor]
    mov     r13, [rel buffer_len]
    sub     r13, rax                    ; r13 = bytes left from the cursor on
    mov     r12, [rel buffer]
    add     r12, rax                    ; r12 = &buffer[cursor]

    ; the first character must be able to start an identifier
    mov     dil, [r12]
    call    is_id_start
    test    rax, rax
    jz      .reject

    mov     r14, 1                      ; r14 = identifier length so far
    jmp     .check_bound
.scan:
    mov     dil, [r12 + r14]
    call    is_id_continue
    test    rax, rax
    jz      .measure
    inc     r14
.check_bound:
    cmp     r14, r13
    jl      .scan

.measure:
    mov     rdi, [rsp]                  ; lexeme_len
    cmp     r14, rdi
    jle     .reject                     ; not longer than the lexeme: keep the lexeme
    add     [rel cursor], r14           ; consume the identifier
    mov     rax, r14
    jmp     .return

.reject:
    xor     eax, eax
.return:
    pop     rdi
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    ret
|
|
|
|
;; Tries to read a line comment ("//" up to, but not including, the next
;; newline or the end of the buffer) at the cursor.
;;
;; In:    nothing (reads cursor, buffer, buffer_len)
;; Out:   rax = length of the comment, and cursor advanced past it;
;;        rax = 0 and cursor untouched when the input does not start with "//"
;; Keeps: r12, r13, r14 (saved and restored)
;; Note:  the first byte at the cursor is read without a bounds check.
is_comment:
    push    rbp
    mov     rbp, rsp
    push    r12
    push    r13
    push    r14

    mov     rax, [rel cursor]
    mov     r13, [rel buffer_len]
    sub     r13, rax                    ; r13 = bytes left from the cursor on
    mov     r12, [rel buffer]
    add     r12, rax                    ; r12 = &buffer[cursor]

    ; two leading slashes are required
    cmp     byte [r12], '/'
    jne     .no_match
    mov     r14, 1
    cmp     r14, r13
    jge     .no_match                   ; only one byte left
    cmp     byte [r12 + r14], '/'
    jne     .no_match

    ; r14 = index of the last byte known to belong to the comment
.scan:
    inc     r14
    cmp     r14, r13
    jge     .consume                    ; comment runs to the end of the buffer
    cmp     byte [r12 + r14], 10        ; newline ends the comment (not consumed)
    jne     .scan

.consume:
    add     [rel cursor], r14
    mov     rax, r14
    jmp     .return

.no_match:
    xor     eax, eax
.return:
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    ret
|
|
|
|
;; Strings are sequences of characters enclosed in double quotes.
;; Strings span multiple lines. A backslash makes the scanner skip the byte
;; that follows it, so an escaped quote does not end the string.
;;
;; In:    nothing (reads cursor, buffer, buffer_len)
;; Out:   rax = length of the string including both quotes, and cursor
;;        advanced past it;
;;        rax = 0 and cursor untouched when the input does not start with '"'
;; Unterminated string: rax = number of bytes left in the buffer and the
;;        cursor is moved to the end of the buffer. The cursor used to stay
;;        put on this path although a non-zero length was returned, so callers
;;        received a token without the tokeniser making any progress.
;; Keeps: r12, r13, r14 (saved and restored)
;; Note:  the first byte at the cursor is read without a bounds check.
is_string:
    push    rbp
    mov     rbp, rsp
    push    r12
    push    r13
    push    r14

    mov     rax, [rel cursor]
    mov     r12, [rel buffer]
    mov     r13, [rel buffer_len]
    add     r12, rax                    ; r12 = &buffer[cursor]
    sub     r13, rax                    ; r13 = bytes left from the cursor on

    mov     dil, [r12]
    cmp     dil, '"'
    jne     .not_string

    mov     r14, 1                      ; r14 = bytes scanned so far (opening quote)
.loop:
    cmp     r14, r13
    jge     .unterminated
    mov     dil, [r12 + r14]
    cmp     dil, '"'
    je      .string
    cmp     dil, 0x5c                   ; backslash
    je      .escape
    inc     r14
    jmp     .loop

.escape:
    inc     r14                         ; step over the backslash
    cmp     r14, r13
    jge     .unterminated
    inc     r14                         ; step over the escaped byte
    jmp     .loop

.string:
    inc     r14                         ; include closing quote
    jmp     .consume

.unterminated:
    ;; TODO: report unterminated string error
    ; r14 equals the number of bytes left here, so consuming it parks the
    ; cursor exactly at the end of the buffer.
.consume:
    add     [rel cursor], r14
    mov     rax, r14
    jmp     .epilogue

.not_string:
    xor     eax, eax
.epilogue:
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    ret
|
|
|
|
;; Numbers are sequences of numeric characters, interspersed with underscores.
;; The leading character must be numeric.
;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
;;
;; In:    nothing (reads cursor, buffer, buffer_len)
;; Out:   rax = length of the number, and cursor advanced past it;
;;        rax = 0 and cursor untouched when the first character is not numeric
;; Keeps: r12, r13, r14 (saved and restored)
;; Calls: is_numeric with the character in dil; a non-zero rax means "digit"
;; Note:  the first byte at the cursor is read without a bounds check.
is_number:
    push    rbp
    mov     rbp, rsp
    push    r12
    push    r13
    push    r14

    mov     rax, [rel cursor]
    mov     r13, [rel buffer_len]
    sub     r13, rax                    ; r13 = bytes left from the cursor on
    mov     r12, [rel buffer]
    add     r12, rax                    ; r12 = &buffer[cursor]

    mov     dil, [r12]
    call    is_numeric
    test    rax, rax
    jz      .reject

    xor     r14d, r14d                  ; bumped to 1 (the leading digit) right below
.accept_char:
    inc     r14                         ; r14 = length accepted so far
    cmp     r14, r13
    jge     .consume                    ; ran into the end of the buffer
    mov     dil, [r12 + r14]
    cmp     dil, '_'
    je      .accept_char                ; underscores are allowed after the first digit
    call    is_numeric
    test    rax, rax
    jnz     .accept_char

.consume:
    add     [rel cursor], r14
    mov     rax, r14
    jmp     .return

.reject:
    xor     eax, eax
.return:
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    ret
|
|
|
|
;; Advances cursor over every character for which is_whitespace answers
;; non-zero, stopping at the first other character or at buffer_len.
;;
;; In:    nothing (reads cursor, buffer, buffer_len)
;; Out:   nothing defined; cursor is updated in place
;; Keeps: r12, r13, r14 (saved and restored)
;; Calls: is_whitespace with the character in dil
skip_whitespaces:
    push    rbp
    mov     rbp, rsp
    push    r12
    push    r13
    push    r14

    mov     r14, [rel buffer]           ; r14 = base of the input
    mov     r13, [rel buffer_len]       ; r13 = one past the last valid index
    mov     r12, [rel cursor]           ; r12 = index being examined
    jmp     .check

.step:
    inc     r12
.check:
    cmp     r12, r13
    jge     .store                      ; reached the end of the buffer
    mov     dil, [r14 + r12]
    call    is_whitespace
    test    rax, rax
    jnz     .step

.store:
    mov     [rel cursor], r12
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    ret
|
|
|
|
|
|
;; fn find_lexeme() -> (u8, *const u8, usize)
;;
;; Skips whitespace, then reads the next token at the cursor and consumes it.
;;
;; In:    rdi = pointer to a 24-byte out-struct
;; Out:   out+0  (8 bytes) token id
;;        out+8  (8 bytes) pointer to the first byte of the token
;;        out+16 (8 bytes) length of the token (0 for TOKEN_EOF)
;;        rax = out pointer, except on the early end-of-input path where
;;        rax = TOKEN_EOF.
;;        NOTE(review): that rax value is kept only for backward
;;        compatibility; callers in this file read the out-struct.
;;
;; Recognition order: string, number, comment, then the LEXEMES table
;; (indices 1 .. NUM_LEXEMES-1), then a plain identifier. A table match is
;; overridden by an identifier that is longer than the matched lexeme.
;; When nothing matches, TOKEN_EOF is stored and the cursor is not moved.
;;
;; Stack: [rbp - 8] = out pointer, [rbp - 16] = caller's r12
find_lexeme:
    push    rbp
    mov     rbp, rsp
    push    rdi                         ; [rbp - 8] = out pointer

    call    skip_whitespaces

    ; init out struct: token 0, pointer to the token start, length 0
    mov     rdi, [rbp - 8]
    mov     rax, [rel buffer]
    add     rax, [rel cursor]
    mov     qword [rdi], 0
    mov     [rdi + 8], rax
    mov     qword [rdi + 16], 0

    ; if cursor >= buffer_len { return TOKEN_EOF; }
    mov     rax, [rel cursor]
    mov     rcx, [rel buffer_len]
    cmp     rax, rcx
    jl      .start
    ; Store TOKEN_EOF explicitly instead of relying on the 0 written above,
    ; so that this path agrees with .not_found.
    mov     qword [rdi], TOKEN_EOF
    mov     rax, TOKEN_EOF
    pop     rdi
    pop     rbp
    ret

.start:
    push    r12                         ; r12 = index into the lexeme tables

    ; special tokens first; each helper returns the consumed length or 0
    call    is_string
    test    rax, rax
    jnz     .is_string
    call    is_number
    test    rax, rax
    jnz     .is_number
    call    is_comment
    test    rax, rax
    jnz     .is_comment

    ; for i in 1..NUM_LEXEMES {
    mov     r12, 1
.loop:
    cmp     r12, [rel NUM_LEXEMES]
    jge     .not_found
    ; rdi = LEXEMES[i], rsi = LEXEME_LENS[i]
    lea     rdi, [rel LEXEMES]
    mov     rdi, [rdi + r12*8]
    lea     rdx, [rel LEXEME_LENS]
    mov     rsi, [rdx + r12*8]
    ; rdx = &buffer[cursor], rcx = bytes left
    mov     rax, [rel cursor]
    mov     rdx, [rel buffer]
    add     rdx, rax
    mov     rcx, [rel buffer_len]
    sub     rcx, rax
    jo      .not_found
    ; if lexeme.len() > buffer.len() - cursor { continue; }
    cmp     rsi, rcx
    jg      .next
    ; if buffer[cursor..cursor+len] == lexeme {
    mov     rcx, rsi
    call    streq                       ; (rdi, rsi) = lexeme, (rdx, rcx) = input slice
    test    rax, rax
    jz      .next

    ; The lexeme matched. An identifier that merely starts with it wins.
    ; The length is reloaded from the table because rsi is caller-saved and
    ; may not survive the call to streq.
    lea     rdi, [rel LEXEME_LENS]
    mov     rdi, [rdi + r12*8]
    call    is_ident
    test    rax, rax
    jnz     .is_ident

    ; cursor += len;
    lea     rsi, [rel LEXEME_LENS]
    mov     rsi, [rsi + r12*8]
    add     [rel cursor], rsi
    ; out.0 = TOKENS[i]; out.2 = len;
    lea     rax, [rel TOKENS]
    movzx   eax, byte [rax + r12]
    mov     rdi, [rbp - 8]
    mov     [rdi], al                   ; upper bytes of out+0 are already 0
    mov     [rdi + 16], rsi
    jmp     .epilogue

.next:
    inc     r12
    jmp     .loop
    ; }

.not_found:
    ; no table entry matched: a plain identifier, or else TOKEN_EOF
    xor     edi, edi
    call    is_ident
    test    rax, rax
    jnz     .is_ident
    mov     rdi, [rbp - 8]
    mov     qword [rdi], TOKEN_EOF

.epilogue:
    pop     r12
    pop     rdi
    pop     rbp
    mov     rax, rdi                    ; return the out pointer
    ret

    ; The tails below receive rax = length consumed by the matching helper.
    ; out+8 already points at the token start (set before anything was consumed).
.is_ident:
    mov     rdi, [rbp - 8]
    mov     qword [rdi], TOKEN_IDENT
    mov     [rdi + 16], rax
    jmp     .epilogue

.is_number:
    mov     rdi, [rbp - 8]
    mov     qword [rdi], TOKEN_NUMBER
    mov     [rdi + 16], rax
    jmp     .epilogue

.is_string:
    mov     rdi, [rbp - 8]
    mov     qword [rdi], TOKEN_STRING
    mov     [rdi + 16], rax
    jmp     .epilogue

.is_comment:
    mov     rdi, [rbp - 8]
    mov     qword [rdi], TOKEN_COMMENT
    mov     [rdi + 16], rax
    jmp     .epilogue
|
|
|
|
;; Reads the next token and consumes it only if it is the expected one.
;;
;; In:    dil = expected token id
;; Out:   match:    rax = pointer to the lexeme, rdx = its length
;;        mismatch: rax = 0, rdx = 0, cursor restored to its value on entry
;;
;; Frame (0x30 bytes below rbp):
;;        [rbp - 0x30] (1 byte)   expected token id
;;        [rbp - 0x28] (8 bytes)  cursor on entry
;;        [rbp - 0x20] (24 bytes) out-struct for find_lexeme: id, pointer, length
expect_token:
    push    rbp
    mov     rbp, rsp
    sub     rsp, 0x30

    mov     [rbp - 0x30], dil
    mov     rax, [rel cursor]
    mov     [rbp - 0x28], rax

    lea     rdi, [rbp - 0x20]
    call    find_lexeme

    mov     rax, [rbp - 0x20]           ; found token
    mov     dil, [rbp - 0x30]           ; expected token
    cmp     al, dil
    jne     .mismatch

    mov     rax, [rbp - 0x18]           ; lexeme pointer
    mov     rdx, [rbp - 0x10]           ; lexeme length
    jmp     .return

.mismatch:
    mov     rdi, [rbp - 0x28]
    mov     [rel cursor], rdi           ; un-consume whatever find_lexeme read
    xor     eax, eax
    xor     edx, edx
.return:
    leave
    ret
|
|
|
|
;; Like expect_token, but a mismatch is fatal.
;;
;; In:    dil = expected token id
;; Out:   rax = pointer to the lexeme, rdx = its length (as left by expect_token)
;; Fails: calls panic when expect_token returns rax = 0
unwrap_token:
    push    rbp
    mov     rbp, rsp
    call    expect_token
    cmp     rax, 0
    je      .no_match
    leave
    ret
.no_match:
    call    panic                       ; not expected to return
|
|
|
|
;; Non-consuming variant of expect_token: the cursor is always put back to
;; where it was on entry.
;;
;; In:    dil = expected token id
;; Out:   rax = 0 if the next token is not the expected one,
;;        else rax = pointer to the lexeme and rdx = its length
peek_expect_token:
    push    rbp
    mov     rbp, rsp
    push    qword [rel cursor]          ; remember where we started
    call    expect_token
    pop     rdi
    mov     [rel cursor], rdi           ; rewind, match or not
    pop     rbp
    ret
|
|
|
|
;; Non-consuming variant of find_lexeme: fills the out-struct with the next
;; token and then puts the cursor back to where it was on entry.
;;
;; In:    rdi = out-struct pointer (passed straight on to find_lexeme)
;; Out:   rax = the out-struct pointer that was passed in
;;
;; Stack: [rbp - 8] = out pointer, [rbp - 16] = cursor on entry
peek_lexeme:
    push    rbp
    mov     rbp, rsp
    push    rdi
    push    qword [rel cursor]
    call    find_lexeme
    mov     rdi, [rbp - 16]
    mov     [rel cursor], rdi           ; restore cursor
    mov     rax, [rbp - 8]
    leave
    ret
|
|
|
|
;; Returns the current cursor so that a caller can later rewind to it with
;; tokeniser_set_cursor.
;;
;; Out:   rax = cursor (byte offset into buffer)
tokeniser_get_cursor:
    mov     rax, qword [rel cursor]
    ret
|
|
|
|
;; Overwrites the cursor. The value is stored as given; it is not checked
;; against buffer_len.
;;
;; In:    rdi = new cursor (byte offset into buffer)
;; Out:   nothing; no register is modified
tokeniser_set_cursor:
    mov     qword [rel cursor], rdi
    ret
|