;; NASM source listing (579 lines, 12 KiB)
;; Compile with:
|
|
;; nasm -f elf64 main.asm -o main.o
|
|
extern int_to_str
|
|
extern oom
|
|
extern panic
|
|
extern strlen
|
|
extern strcmp
|
|
extern streq
|
|
extern memcpy
|
|
extern eprint_str
|
|
extern exit
|
|
extern error_to_str
|
|
extern eprint_error
|
|
extern alloc_pages
|
|
extern allocate
|
|
extern fopen_read
|
|
|
|
extern is_alpha
|
|
extern is_numeric
|
|
extern is_id_continue
|
|
extern is_id_start
|
|
|
|
section .data
;; Diagnostic strings. None of them is NUL-terminated; every *_len constant
;; is the byte count of the string defined immediately before it.
;; hello_msg and test_success are written with eprint_str elsewhere in this
;; file; file_error_msg is not referenced in the part of the file seen here.
hello_msg:              db      "Hello, World!", 0x0A
hello_msg_len           equ     $ - hello_msg

file_error_msg:         db      "Could not open file: "
file_error_msg_len      equ     $ - file_error_msg

test_success:           db      "All tests passed!", 0x0A
test_success_len        equ     $ - test_success
|
|
|
|
section .text
global _start

;; ---------------------------------------------------------------------
;; _start -- process entry point.
;; Writes the test_success banner through eprint_str (address in rdi,
;; length in rsi) and then calls exit with rdi = 0.
;; NOTE(review): compiler_entry is never reached from here -- confirm
;; that this is intentional.
;; ---------------------------------------------------------------------
_start:
        mov     rsi, test_success_len   ; rsi = banner length in bytes
        mov     rdi, test_success       ; rdi = banner address
        call    eprint_str

        mov     edi, 0                  ; rdi = 0 (32-bit write zero-extends)
        call    exit
|
|
|
|
|
|
;; ---------------------------------------------------------------------
;; compiler_entry -- tokenise the file named by argv[1] and echo each
;; token's spelling with eprint_str.
;;
;; Stack on entry: [rsp] = argc, [rsp + 8] = argv[0], [rsp + 16] = argv[1].
;; NOTE(review): that layout only holds when control arrives here straight
;; from the process entry point with an untouched stack (jmp, not call).
;;
;; Helpers are called with the registers the original code used
;; (eprint_str: rcx = address, rdx = length; tokeniser_init: rdx = filename).
;; NOTE(review): _start passes eprint_str its arguments in rdi/rsi instead;
;; confirm which convention eprint_str really implements.
;; ---------------------------------------------------------------------
compiler_entry:
        mov     rcx, hello_msg
        mov     rdx, hello_msg_len
        call    eprint_str

        mov     rax, [rsp]              ; argc
        cmp     rax, 1
        jle     .no_filename            ; argc <= 1: no filename was given

        mov     rax, [rsp + 16]         ; argv[1]
        mov     rdx, rax                ; rdx = pointer to filename
        call    tokeniser_init
        call    tokeniser_print

.loop:
        call    tokeniser_next_token
        test    rax, rax                ; TOKEN_EOF (0) ends the loop
        jz      .exit

        ; LEXEMES / LEXEME_LENS only have NUM_LEXEMES entries, while
        ; TOKEN_IDENT and TOKEN_NUMBER are numbered past the end of them.
        cmp     rax, NUM_LEXEMES
        jae     .placeholder

        mov     rcx, [LEXEMES + rax*8]
        mov     rdx, [LEXEME_LENS + rax*8]
        call    eprint_str
        jmp     .loop

.placeholder:
        ; Print the fixed placeholder text instead of reading past the tables.
        mov     rcx, LEX_IDENT
        mov     rdx, LEX_IDENT_len
        cmp     rax, TOKEN_IDENT
        je      .print_placeholder
        mov     rcx, LEX_NUMBER
        mov     rdx, LEX_NUMBER_len
.print_placeholder:
        call    eprint_str
        jmp     .loop

.no_filename:
        call    panic                   ; presumably does not return
        mov     edi, 1                  ; if it does, leave with a failure status
        jmp     .do_exit

.exit:
        xor     edi, edi                ; status 0, in rdi as _start passes it
.do_exit:
        call    exit
|
|
|
|
;; =============================
|
|
;; Tokeniser functions
|
|
;; =============================
|
|
|
|
;; tokeniser state
|
|
section .data
;; Tokeniser state; everything is zero until tokeniser_init has run.
input_file:     dd      0               ; descriptor stored from eax (4 bytes only)
buffer:         dq      0               ; address returned by allocate
cursor:         dq      0               ; byte offset into buffer of the next character
buffer_len:     dq      0               ; byte count taken from fstat's st_size
|
|
|
|
;; each buffer is chunk_size bytes large
|
|
;; buffer header structure:
|
|
;; +0 (8 bytes): pointer to buffer
|
|
;; +8 (8 bytes): size of buffer
|
|
|
|
;; Tokens:
|
|
;; [let, if, else, fn, return, loop, break, continue, true, false, i32, u32, bool, =, +, -, *, /, %, ==, !=, <, <=, >, >=, &&, ||, !, (, ), {, }, [, ], ;, ',', ]
|
|
|
|
section .bss
;; Output area handed to the fstat syscall by tokeniser_init, which then
;; reads the file size from offset 48 of it.
statbuf:        resb    144
|
|
|
|
section .text
;; ---------------------------------------------------------------------
;; tokeniser_init -- open the source file and load all of it into memory.
;;
;; In:   rdx = pointer to filename (null-terminated); it is left untouched
;;             for fopen_read, which is called first.
;; Out:  [input_file] = value fopen_read returned in eax
;;       [buffer]     = value allocate returned in rax
;;       [buffer_len] = st_size reported by fstat
;;       [cursor]     = 0
;; Errors: a negative fstat or read result is handed to eprint_error (in
;;       rcx) and then panic is called.
;; NOTE(review): a short read (fewer bytes than st_size) is not detected;
;;       buffer_len keeps the fstat size.
;; ---------------------------------------------------------------------
tokeniser_init:
        ; this panics if the file doesn't exist
        call    fopen_read
        mov     [input_file], eax       ; store file descriptor

        ; cursor and buffer_len are qwords: clear all eight bytes of each
        mov     qword [cursor], 0
        mov     qword [buffer_len], 0

        ; fstat(fd, statbuf)
        mov     eax, 5                  ; syscall: fstat
        mov     edi, [input_file]       ; input_file is a dword -- a 64-bit
                                        ; load would also pick up the low
                                        ; half of the adjacent `buffer`
        lea     rsi, [statbuf]
        syscall
        test    rax, rax
        js      .report_error           ; negative result = error

        ; allocate st_size bytes
        mov     rcx, [statbuf + 48]     ; st_size
        call    allocate
        mov     [buffer], rax
        mov     rcx, [statbuf + 48]     ; re-read st_size instead of relying
        mov     [buffer_len], rcx       ; on a register surviving the call

        ; read(fd, buffer, buffer_len)
        xor     eax, eax                ; syscall: read
        mov     edi, [input_file]
        mov     rsi, [buffer]
        mov     rdx, [buffer_len]
        syscall
        test    rax, rax
        js      .report_error
        ret

.report_error:
        mov     rcx, rax                ; negative syscall result
        call    eprint_error
        call    panic                   ; presumably does not return
|
|
|
|
section .rdata
;; Heading that tokeniser_print writes first. It ends with a newline and
;; is not NUL-terminated; tokeniser_buffer_len is its byte count.
;; NOTE(review): the build line at the top of the file targets elf64, where
;; the conventional read-only data section is .rodata -- confirm that
;; .rdata is intended.
tokeniser_buffer:       db      "Tokeniser buffer: ", 0x0A
tokeniser_buffer_len    equ     $ - tokeniser_buffer
|
|
|
|
section .text
;; ---------------------------------------------------------------------
;; tokeniser_print -- debugging aid: write the "Tokeniser buffer: " heading
;; and then the part of the buffer that has not been consumed yet.
;;
;; In:   nothing (reads [buffer], [cursor], [buffer_len])
;; Calls eprint_str with rcx = address, rdx = length.
;; ---------------------------------------------------------------------
tokeniser_print:
        mov     rcx, tokeniser_buffer
        mov     rdx, tokeniser_buffer_len
        call    eprint_str

        mov     rax, [cursor]
        mov     rcx, [buffer]           ; `buffer` holds a pointer: load it,
        add     rcx, rax                ; then step to buffer + cursor
        mov     rdx, [buffer_len]
        sub     rdx, rax                ; only the bytes from cursor to the end
        call    eprint_str
        ret                             ; do not run on into the next routine
|
|
|
|
section .rdata
;; LEXEMES[i] = address of the spelling of lexeme i (8 bytes per entry).
;; Entry 0 is the "<not a lexeme>" sentinel. The table has NUM_LEXEMES
;; entries, in the same order as TOKENS and LEXEME_LENS.
LEXEMES:
        dq      LEX_NOT_A_LEXEME, LEX_LET,    LEX_IF,      LEX_ELSE     ;  0 ..  3
        dq      LEX_FN,           LEX_RETURN, LEX_LOOP,    LEX_BREAK    ;  4 ..  7
        dq      LEX_CONTINUE,     LEX_TRUE,   LEX_FALSE,   LEX_BOOL     ;  8 .. 11
        dq      LEX_ARROW,        LEX_I32,    LEX_U32,     LEX_EQUALS   ; 12 .. 15
        dq      LEX_PLUS,         LEX_MINUS,  LEX_RPARENS, LEX_LPARENS  ; 16 .. 19
        dq      LEX_RBRACE,       LEX_LBRACE, LEX_COLON,   LEX_SEMI     ; 20 .. 23
        dq      LEX_COMMA,        LEX_PIPE,   LEX_AMP,     LEX_EQEQ     ; 24 .. 27
|
|
;; TOKENS[i] = token id for lexeme i, one byte per entry, in the same
;; order as LEXEMES and LEXEME_LENS.
TOKENS:
        db      TOKEN_EOF,      TOKEN_LET,    TOKEN_IF,      TOKEN_ELSE     ;  0 ..  3
        db      TOKEN_FN,       TOKEN_RETURN, TOKEN_LOOP,    TOKEN_BREAK    ;  4 ..  7
        db      TOKEN_CONTINUE, TOKEN_TRUE,   TOKEN_FALSE,   TOKEN_BOOL     ;  8 .. 11
        db      TOKEN_ARROW,    TOKEN_I32,    TOKEN_U32,     TOKEN_EQUALS   ; 12 .. 15
        db      TOKEN_PLUS,     TOKEN_MINUS,  TOKEN_RPARENS, TOKEN_LPARENS  ; 16 .. 19
        db      TOKEN_RBRACE,   TOKEN_LBRACE, TOKEN_COLON,   TOKEN_SEMI     ; 20 .. 23
        db      TOKEN_COMMA,    TOKEN_PIPE,   TOKEN_AMP,     TOKEN_EQEQ     ; 24 .. 27
|
|
;; LEXEME_LENS[i] = byte length of lexeme i (8 bytes per entry), in the
;; same order as LEXEMES. Entry 0, the sentinel, has length 0.
LEXEME_LENS:
        dq      0,                LEX_LET_len,    LEX_IF_len,      LEX_ELSE_len     ;  0 ..  3
        dq      LEX_FN_len,       LEX_RETURN_len, LEX_LOOP_len,    LEX_BREAK_len    ;  4 ..  7
        dq      LEX_CONTINUE_len, LEX_TRUE_len,   LEX_FALSE_len,   LEX_BOOL_len     ;  8 .. 11
        dq      LEX_ARROW_len,    LEX_I32_len,    LEX_U32_len,     LEX_EQUALS_len   ; 12 .. 15
        dq      LEX_PLUS_len,     LEX_MINUS_len,  LEX_RPARENS_len, LEX_LPARENS_len  ; 16 .. 19
        dq      LEX_RBRACE_len,   LEX_LBRACE_len, LEX_COLON_len,   LEX_SEMI_len     ; 20 .. 23
        dq      LEX_COMMA_len,    LEX_PIPE_len,   LEX_AMP_len,     LEX_EQEQ_len     ; 24 .. 27
|
|
|
|
NUM_LEXEMES             equ     28      ; entries in LEXEMES / TOKENS / LEXEME_LENS

;; ---- token ids ------------------------------------------------------
;; Ids 0..27 equal the token's index in the three tables above; TOKEN_IDENT
;; and TOKEN_NUMBER are numbered after the last table entry.
TOKEN_EOF               equ     0
TOKEN_LET               equ     1
TOKEN_IF                equ     2
TOKEN_ELSE              equ     3
TOKEN_FN                equ     4
TOKEN_RETURN            equ     5
TOKEN_LOOP              equ     6
TOKEN_BREAK             equ     7
TOKEN_CONTINUE          equ     8
TOKEN_TRUE              equ     9
TOKEN_FALSE             equ     10
TOKEN_BOOL              equ     11
TOKEN_ARROW             equ     12
TOKEN_I32               equ     13
TOKEN_U32               equ     14
TOKEN_EQUALS            equ     15
TOKEN_PLUS              equ     16
TOKEN_MINUS             equ     17
TOKEN_RPARENS           equ     18
TOKEN_LPARENS           equ     19
TOKEN_RBRACE            equ     20
TOKEN_LBRACE            equ     21
TOKEN_COLON             equ     22
TOKEN_SEMI              equ     23
TOKEN_COMMA             equ     24
TOKEN_PIPE              equ     25
TOKEN_AMP               equ     26
TOKEN_EQEQ              equ     27
TOKEN_IDENT             equ     28
TOKEN_NUMBER            equ     29

;; ---- lexeme spellings -----------------------------------------------
;; Only LEX_NOT_A_LEXEME is NUL-terminated; every other spelling is
;; followed by a *_len constant giving its byte count.
LEX_NOT_A_LEXEME:       db      "<not a lexeme>", 0

;; keywords
LEX_LET:                db      "let"
LEX_LET_len             equ     $ - LEX_LET
LEX_IF:                 db      "if"
LEX_IF_len              equ     $ - LEX_IF
LEX_ELSE:               db      "else"
LEX_ELSE_len            equ     $ - LEX_ELSE
LEX_FN:                 db      "fn"
LEX_FN_len              equ     $ - LEX_FN
LEX_RETURN:             db      "return"
LEX_RETURN_len          equ     $ - LEX_RETURN
LEX_LOOP:               db      "loop"
LEX_LOOP_len            equ     $ - LEX_LOOP
LEX_BREAK:              db      "break"
LEX_BREAK_len           equ     $ - LEX_BREAK
LEX_CONTINUE:           db      "continue"
LEX_CONTINUE_len        equ     $ - LEX_CONTINUE
LEX_TRUE:               db      "true"
LEX_TRUE_len            equ     $ - LEX_TRUE
LEX_FALSE:              db      "false"
LEX_FALSE_len           equ     $ - LEX_FALSE
LEX_BOOL:               db      "bool"
LEX_BOOL_len            equ     $ - LEX_BOOL
LEX_ARROW:              db      "->"
LEX_ARROW_len           equ     $ - LEX_ARROW
LEX_I32:                db      "i32"
LEX_I32_len             equ     $ - LEX_I32
LEX_U32:                db      "u32"
LEX_U32_len             equ     $ - LEX_U32

;; operators and punctuation
LEX_EQUALS:             db      "="
LEX_EQUALS_len          equ     $ - LEX_EQUALS
LEX_PLUS:               db      "+"
LEX_PLUS_len            equ     $ - LEX_PLUS
LEX_MINUS:              db      "-"
LEX_MINUS_len           equ     $ - LEX_MINUS
LEX_RPARENS:            db      ")"
LEX_RPARENS_len         equ     $ - LEX_RPARENS
LEX_LPARENS:            db      "("
LEX_LPARENS_len         equ     $ - LEX_LPARENS
LEX_RBRACE:             db      "}"
LEX_RBRACE_len          equ     $ - LEX_RBRACE
LEX_LBRACE:             db      "{"
LEX_LBRACE_len          equ     $ - LEX_LBRACE
LEX_COLON:              db      ":"
LEX_COLON_len           equ     $ - LEX_COLON
LEX_SEMI:               db      ";"
LEX_SEMI_len            equ     $ - LEX_SEMI
LEX_COMMA:              db      ","
LEX_COMMA_len           equ     $ - LEX_COMMA
LEX_PIPE:               db      "|"
LEX_PIPE_len            equ     $ - LEX_PIPE
LEX_AMP:                db      "&"
LEX_AMP_len             equ     $ - LEX_AMP
LEX_EQEQ:               db      "=="
LEX_EQEQ_len            equ     $ - LEX_EQEQ

;; placeholder texts for tokens that have no fixed spelling
LEX_IDENT:              db      "<identifier>"
LEX_IDENT_len           equ     $ - LEX_IDENT
LEX_NUMBER:             db      "<number>"
LEX_NUMBER_len          equ     $ - LEX_NUMBER
|
|
|
|
section .text

;; ---------------------------------------------------------------------
;; is_ident -- test whether an identifier longer than an already matched
;; lexeme starts at the cursor and, if so, consume it.
;;
;; In:   rdi = length of matched lexeme (0 when nothing was matched)
;; Out:  rax = 1 and [cursor] advanced past the identifier when the
;;             character at the cursor satisfies is_id_start and the run
;;             of is_id_continue characters is longer than rdi;
;;       rax = 0 and [cursor] unchanged otherwise, including when no
;;             input is left.
;; Preserves rdi, r12, r13, r14 and rbp.
;; is_id_start / is_id_continue receive the character in rdi (zero-
;; extended) and are treated as returning non-zero for "yes".
;; ---------------------------------------------------------------------
is_ident:
        push    rbp
        mov     rbp, rsp
        push    r12
        push    r13
        push    r14
        push    rdi                     ; [rsp] = matched length; rdi is reused below

        ; Load the VALUES of the state variables (not their addresses).
        mov     rax, [rel cursor]
        mov     r12, [rel buffer]
        add     r12, rax                ; r12 = address of the current character
        mov     r13, [rel buffer_len]
        sub     r13, rax                ; r13 = bytes left in the buffer
        jbe     .not_ident              ; cursor at or past the end: nothing to read

        ; check first char is id_start
        movzx   edi, byte [r12]
        call    is_id_start
        test    rax, rax
        jz      .not_ident

        xor     r14d, r14d              ; r14 = identifier length so far
.loop:
        cmp     r14, r13
        jae     .done                   ; ran out of input
        movzx   edi, byte [r12 + r14]
        call    is_id_continue
        test    rax, rax
        jz      .done
        inc     r14
        jmp     .loop

.done:
        ; An identifier only wins over the matched lexeme when it is longer.
        cmp     r14, [rsp]
        jbe     .not_ident
        add     [rel cursor], r14       ; cursor += identifier length
        mov     eax, 1
        jmp     .epilogue

.not_ident:
        xor     eax, eax
.epilogue:
        pop     rdi
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        ret
|
|
|
|
;; is_number -- placeholder that always reports "not a number".
;; Out:  rax = 0
is_number:
        xor     eax, eax                ; writing eax clears all of rax
        ret
|
|
|
|
;; ---------------------------------------------------------------------
;; find_lexeme -- classify the text at the cursor.
;;
;;   for i in 1..NUM_LEXEMES {
;;       let len = LEXEME_LENS[i];
;;       if buffer[cursor..cursor+len] == LEXEMES[i] {
;;           if is_ident(len)    { return TOKEN_IDENT;  }
;;           if is_number()      { return TOKEN_NUMBER; }
;;           cursor += len;
;;           return TOKENS[i];
;;       }
;;   }
;;   if is_ident(0)  { return TOKEN_IDENT;  }
;;   if is_number()  { return TOKEN_NUMBER; }
;;   return TOKEN_EOF;
;;
;; In:   nothing (reads [buffer], [cursor], [buffer_len])
;; Out:  rax = token id
;; Preserves r12, r13 and rbp.
;;
;; Index 0 is skipped: it is the zero-length "<not a lexeme>" sentinel.
;; streq is called with rdi = lexeme, rsi = buffer position, rdx = length
;; and is treated as returning 1 for "equal", as try_lexeme treats it.
;; NOTE(review): try_lexeme passes streq its arguments in rcx/rdx/r8/r9 --
;; confirm which register convention streq really uses.
;; ---------------------------------------------------------------------
find_lexeme:
        push    rbp
        mov     rbp, rsp
        push    r12                     ; r12 = lexeme index i
        push    r13                     ; r13 = LEXEME_LENS[i]
        mov     r12d, 1

.loop:
        cmp     r12, NUM_LEXEMES
        jae     .not_found

        mov     r13, [LEXEME_LENS + r12*8]

        ; skip lexemes that do not fit in what is left of the buffer
        mov     rax, [rel cursor]
        mov     rcx, [rel buffer_len]
        sub     rcx, rax                ; rcx = bytes left
        jb      .next
        sub     rcx, r13                ; rcx = bytes left after the lexeme
        jb      .next

        ; if buffer[cursor..cursor+len] == lexeme {
        mov     rdi, [LEXEMES + r12*8]
        mov     rsi, [rel buffer]
        add     rsi, rax
        mov     rdx, r13
        call    streq
        cmp     rax, 1
        jne     .next

        ; a longer identifier beats the fixed lexeme
        mov     rdi, r13
        call    is_ident
        test    rax, rax
        jnz     .is_ident

        call    is_number
        test    rax, rax
        jnz     .is_number

        add     [rel cursor], r13       ; cursor += len
        movzx   eax, byte [TOKENS + r12] ; TOKENS holds one byte per entry
        jmp     .epilogue

.next:
        inc     r12
        jmp     .loop

.not_found:
        xor     edi, edi                ; no lexeme matched: matched length is 0
        call    is_ident
        test    rax, rax
        jnz     .is_ident

        call    is_number
        test    rax, rax
        jnz     .is_number

        mov     eax, TOKEN_EOF

.epilogue:
        pop     r13
        pop     r12
        pop     rbp
        ret

.is_ident:
        mov     eax, TOKEN_IDENT
        jmp     .epilogue

.is_number:
        mov     eax, TOKEN_NUMBER
        jmp     .epilogue
|
|
|
|
|
|
;; ---------------------------------------------------------------------
;; try_lexeme -- try to match one fixed lexeme at the cursor.
;;
;; In:   rcx = lexeme index
;; Out:  rax = 0            no match, [cursor] unchanged
;;       rax = TOKEN_IDENT  the lexeme is only the start of a longer
;;                          identifier; [cursor] is moved past all of it
;;       rax = rcx          the lexeme matched; [cursor] is moved past it
;;                          (every TOKENS[i] equals i, so the index is
;;                          also the token id)
;; Preserves rcx, r8 and r15.
;;
;; streq gets rcx = lexeme, rdx = lexeme length, r8 = buffer position,
;; r9 = length, and returns 1 for "equal". is_id_start / is_id_continue
;; get the character in cl and return 1 for "yes".
;; ---------------------------------------------------------------------
try_lexeme:
        push    r8
        push    r15
        push    rcx                     ; [rsp] = lexeme index

        ; does the lexeme fit in what is left of the buffer?
        mov     r9, [cursor]
        mov     r8, [buffer]
        add     r8, r9                  ; r8 = address of the current character
        mov     rax, [buffer_len]
        sub     rax, r9                 ; rax = bytes left
        mov     rdx, [LEXEME_LENS + rcx*8]
        cmp     rax, rdx
        jl      .not_equal

        ; compare memory
        mov     rcx, [LEXEMES + rcx*8]
        mov     r9, rdx
        call    streq
        cmp     rax, 1
        jne     .not_equal

        ; r15 = offset just past the lexeme. The length is looked up again
        ; rather than assumed to have survived the call in rdx.
        mov     rdx, [rsp]
        mov     r15, [LEXEME_LENS + rdx*8]
        add     r15, [cursor]

        ; It is an identifier instead when the buffer has not ended, the
        ; next character is id_continue and the first one is id_start.
        cmp     r15, [buffer_len]
        jge     .not_ident
        mov     rax, [buffer]           ; `buffer` holds a pointer: load it first
        mov     cl, [rax + r15]
        call    is_id_continue
        cmp     rax, 1
        jne     .not_ident

        mov     rax, [buffer]
        add     rax, [cursor]
        mov     cl, [rax]
        call    is_id_start
        cmp     rax, 1
        jne     .not_ident

        ; this is an ident: move forward while is_id_continue
.try_lexeme_loop:
        cmp     r15, [buffer_len]
        jge     .done_ident
        mov     rax, [buffer]
        mov     cl, [rax + r15]
        call    is_id_continue
        cmp     rax, 1
        jne     .done_ident
        inc     r15
        jmp     .try_lexeme_loop

.done_ident:
        mov     [cursor], r15
        pop     rcx
        pop     r15
        pop     r8
        mov     rax, TOKEN_IDENT
        ret

.not_ident:
        mov     [cursor], r15           ; r15 is still cursor + lexeme length here
        pop     rcx
        pop     r15
        pop     r8
        mov     rax, rcx                ; lexeme index doubles as the token id
        ret

.not_equal:
        pop     rcx
        pop     r15
        pop     r8
        xor     eax, eax
        ret
|
|
|
|
;; ---------------------------------------------------------------------
;; tokeniser_next_token -- return the next token from the buffer.
;;
;; In:   nothing (reads and advances [cursor])
;; Out:  rax = token enumerator: TOKEN_EOF at the end of the buffer,
;;             otherwise whatever try_lexeme reported -- a lexeme's token
;;             id, or TOKEN_IDENT.
;; Preserves r15.
;;
;; A space, and any character at which no lexeme matches, is skipped one
;; byte at a time.
;; NOTE: TOKEN_IDENT is >= NUM_LEXEMES, so callers must range-check the
;; result before using it to index LEXEMES / LEXEME_LENS.
;; TODO: numbers, identifiers that do not begin with a keyword
;; ---------------------------------------------------------------------
tokeniser_next_token:
        push    r15                     ; r15 is the lexeme loop counter below

.loop:
        ; check if at end of buffer
        mov     rax, [cursor]
        cmp     rax, [buffer_len]
        jge     .eof

        ; look at the current byte; skip a space right away
        mov     rcx, [buffer]
        cmp     byte [rcx + rax], ' '
        je      .skip

        mov     r15, 1                  ; lexeme index; 0 is the sentinel entry
.inner_loop:
        cmp     r15, NUM_LEXEMES
        jge     .skip                   ; nothing matched here: drop one byte

        mov     rcx, r15
        call    try_lexeme
        test    rax, rax
        jnz     .return_token           ; rax already holds the token to return

        inc     r15
        jmp     .inner_loop

.skip:
        inc     qword [cursor]
        jmp     .loop

.eof:
        mov     eax, TOKEN_EOF
.return_token:
        pop     r15
        ret
|