;; from-scratch/lang/src/main.asm

;; Compile with:
;; nasm -f elf64 main.asm -o main.o
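;; and link together with the helper routines declared extern below, e.g.
;; (assuming they are assembled into a util.o from the same project):
;;   ld -o lang main.o util.o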
extern int_to_str
extern oom
extern panic
extern strlen
extern strcmp
extern streq
extern memcpy
extern eprint_str
extern exit
extern error_to_str
extern eprint_error
extern alloc_pages
extern allocate
extern fopen_read
extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start
section .data
hello_msg db "Hello, World!", 10
hello_msg_len equ $ - hello_msg
file_error_msg db "Could not open file: "
file_error_msg_len equ $ - file_error_msg
test_success db "All tests passed!", 10
test_success_len equ $ - test_success
section .text
global _start
_start:
mov rdi, test_success
mov rsi, test_success_len
call eprint_str
mov rdi, 0
call exit
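;; Presumably the eventual entry point: it reads argc/argv straight off the
;; initial process stack ([rsp] = argc), so it has to be jumped to as the
;; entry point rather than call'ed. _start above does not reach it yet.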
compiler_entry:
; get filename from argv[1]
; argv is at rsp + 8
; check if argc > 1
mov rcx, hello_msg
mov rdx, hello_msg_len
call eprint_str
mov rax, [rsp] ; argc
cmp rax, 1
jle .no_filename ; if argc <= 1, no filename provided
; get argv[1]
mov rax, [rsp + 16] ; argv[1]
; init tokeniser
mov rdx, rax ; rdx = pointer to filename
call tokeniser_init
call tokeniser_print
.loop:
call tokeniser_next_token
cmp rax, 0
je .exit
mov rcx, [LEXEMES + rax*8]
mov rdx, [LEXEME_LENS + rax*8]
call eprint_str
jmp .loop
.no_filename:
call panic
.exit:
xor rdi, rdi ; exit code 0
call exit
;; =============================
;; Tokeniser functions
;; =============================
;; tokeniser state
section .data
input_file dq 0 ; fd of the source file
buffer dq 0 ; pointer to the file contents
cursor dq 0 ; current byte offset into buffer
buffer_len dq 0 ; size of buffer in bytes
;; The whole file is read into a single buffer up front in tokeniser_init.
;; Tokens:
;; [let, if, else, fn, return, loop, break, continue, true, false, bool, ->, i32, u32, =, +, -, ), (, }, {, :, ;, ',', |, &, ==]
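;; A rough sketch of the kind of source these tokens are meant to cover
;; (the grammar itself is an assumption, not defined in this file):
;;   fn add(a: i32, b: i32) -> i32 {
;;       let c: i32 = a + b;
;;       return c;
;;   }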
section .bss
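;; sizeof(struct stat) on x86-64 Linux is 144 bytes; st_size is the qword
;; at offset +48, which tokeniser_init reads after the fstat syscall.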
statbuf: resb 144
section .text
;; Initialises the tokeniser
;; rdx: pointer to filename (null-terminated)
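;; Opens the file, fstats it to learn its size, allocates a buffer that
;; big, and reads the whole file into it in one go.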
tokeniser_init:
; open file for reading
; this panics if the file doesn't exist
call fopen_read
mov [input_file], rax ; store file descriptor
mov qword [cursor], 0
mov qword [buffer_len], 0
; fstat
mov rax, 5 ; syscall: fstat
mov rdi, [input_file] ; fd
lea rsi, [statbuf] ; statbuf
syscall
cmp rax, 0
jl .report_error
; get file size from statbuf
mov r15, [statbuf + 48] ; st_size
; allocate buffer
mov rcx, r15
call allocate
mov [buffer], rax
mov [buffer_len], r15
; read file into buffer
mov rax, 0 ; syscall: read
mov rdi, [input_file] ; fd
mov rsi, [buffer] ; buf
mov rdx, [buffer_len] ; count
syscall
cmp rax, 0
jl .report_error
ret
.report_error:
mov rcx, rax
call eprint_error
call panic
section .rodata
tokeniser_buffer db "Tokeniser buffer: ", 10
tokeniser_buffer_len equ $ - tokeniser_buffer
section .text
tokeniser_print:
mov rcx, tokeniser_buffer
mov rdx, tokeniser_buffer_len
call eprint_str
mov rax, [cursor]
mov rcx, [buffer]
add rcx, rax ; rcx = &buffer[cursor]
mov rdx, [buffer_len]
sub rdx, rax ; rdx = bytes remaining after the cursor
call eprint_str
ret
section .rodata
LEXEMES: dq \
LEX_NOT_A_LEXEME, \
LEX_LET, \
LEX_IF, \
LEX_ELSE, \
LEX_FN, \
LEX_RETURN, \
LEX_LOOP, \
LEX_BREAK, \
LEX_CONTINUE, \
LEX_TRUE, \
LEX_FALSE, \
LEX_BOOL, \
LEX_ARROW, \
LEX_I32, \
LEX_U32, \
LEX_EQUALS, \
LEX_PLUS, \
LEX_MINUS, \
LEX_RPARENS, \
LEX_LPARENS, \
LEX_RBRACE, \
LEX_LBRACE, \
LEX_COLON, \
LEX_SEMI, \
LEX_COMMA, \
LEX_PIPE, \
LEX_AMP, \
LEX_EQEQ
TOKENS: db \
TOKEN_EOF, \
TOKEN_LET, \
TOKEN_IF, \
TOKEN_ELSE, \
TOKEN_FN, \
TOKEN_RETURN, \
TOKEN_LOOP, \
TOKEN_BREAK, \
TOKEN_CONTINUE, \
TOKEN_TRUE, \
TOKEN_FALSE, \
TOKEN_BOOL, \
TOKEN_ARROW, \
TOKEN_I32, \
TOKEN_U32, \
TOKEN_EQUALS, \
TOKEN_PLUS, \
TOKEN_MINUS, \
TOKEN_RPARENS, \
TOKEN_LPARENS, \
TOKEN_RBRACE, \
TOKEN_LBRACE, \
TOKEN_COLON, \
TOKEN_SEMI, \
TOKEN_COMMA, \
TOKEN_PIPE, \
TOKEN_AMP, \
TOKEN_EQEQ
LEXEME_LENS: dq \
0, \
LEX_LET_len, \
LEX_IF_len, \
LEX_ELSE_len, \
LEX_FN_len, \
LEX_RETURN_len, \
LEX_LOOP_len, \
LEX_BREAK_len, \
LEX_CONTINUE_len, \
LEX_TRUE_len, \
LEX_FALSE_len, \
LEX_BOOL_len, \
LEX_ARROW_len, \
LEX_I32_len, \
LEX_U32_len, \
LEX_EQUALS_len, \
LEX_PLUS_len, \
LEX_MINUS_len, \
LEX_RPARENS_len, \
LEX_LPARENS_len, \
LEX_RBRACE_len, \
LEX_LBRACE_len, \
LEX_COLON_len, \
LEX_SEMI_len, \
LEX_COMMA_len, \
LEX_PIPE_len, \
LEX_AMP_len, \
LEX_EQEQ_len
NUM_LEXEMES equ 28
LEX_NOT_A_LEXEME db "<not a lexeme>", 0
TOKEN_EOF equ 0
TOKEN_LET equ 1
LEX_LET db "let"
LEX_LET_len equ $ - LEX_LET
TOKEN_IF equ 2
LEX_IF db "if"
LEX_IF_len equ $ - LEX_IF
TOKEN_ELSE equ 3
LEX_ELSE db "else"
LEX_ELSE_len equ $ - LEX_ELSE
TOKEN_FN equ 4
LEX_FN db "fn"
LEX_FN_len equ $ - LEX_FN
TOKEN_RETURN equ 5
LEX_RETURN db "return"
LEX_RETURN_len equ $ - LEX_RETURN
TOKEN_LOOP equ 6
LEX_LOOP db "loop"
LEX_LOOP_len equ $ - LEX_LOOP
TOKEN_BREAK equ 7
LEX_BREAK db "break"
LEX_BREAK_len equ $ - LEX_BREAK
TOKEN_CONTINUE equ 8
LEX_CONTINUE db "continue"
LEX_CONTINUE_len equ $ - LEX_CONTINUE
TOKEN_TRUE equ 9
LEX_TRUE db "true"
LEX_TRUE_len equ $ - LEX_TRUE
TOKEN_FALSE equ 10
LEX_FALSE db "false"
LEX_FALSE_len equ $ - LEX_FALSE
TOKEN_BOOL equ 11
LEX_BOOL db "bool"
LEX_BOOL_len equ $ - LEX_BOOL
TOKEN_ARROW equ 12
LEX_ARROW db "->"
LEX_ARROW_len equ $ - LEX_ARROW
TOKEN_I32 equ 13
LEX_I32 db "i32"
LEX_I32_len equ $ - LEX_I32
TOKEN_U32 equ 14
LEX_U32 db "u32"
LEX_U32_len equ $ - LEX_U32
TOKEN_EQUALS equ 15
LEX_EQUALS db "="
LEX_EQUALS_len equ $ - LEX_EQUALS
TOKEN_PLUS equ 16
LEX_PLUS db "+"
LEX_PLUS_len equ $ - LEX_PLUS
TOKEN_MINUS equ 17
LEX_MINUS db "-"
LEX_MINUS_len equ $ - LEX_MINUS
TOKEN_RPARENS equ 18
LEX_RPARENS db ")"
LEX_RPARENS_len equ $ - LEX_RPARENS
TOKEN_LPARENS equ 19
LEX_LPARENS db "("
LEX_LPARENS_len equ $ - LEX_LPARENS
TOKEN_RBRACE equ 20
LEX_RBRACE db "}"
LEX_RBRACE_len equ $ - LEX_RBRACE
TOKEN_LBRACE equ 21
LEX_LBRACE db "{"
LEX_LBRACE_len equ $ - LEX_LBRACE
TOKEN_COLON equ 22
LEX_COLON db ":"
LEX_COLON_len equ $ - LEX_COLON
TOKEN_SEMI equ 23
LEX_SEMI db ";"
LEX_SEMI_len equ $ - LEX_SEMI
TOKEN_COMMA equ 24
LEX_COMMA db ","
LEX_COMMA_len equ $ - LEX_COMMA
TOKEN_PIPE equ 25
LEX_PIPE db "|"
LEX_PIPE_len equ $ - LEX_PIPE
TOKEN_AMP equ 26
LEX_AMP db "&"
LEX_AMP_len equ $ - LEX_AMP
TOKEN_EQEQ equ 27
LEX_EQEQ db "=="
LEX_EQEQ_len equ $ - LEX_EQEQ
TOKEN_IDENT equ 28
LEX_IDENT db "<identifier>"
LEX_IDENT_len equ $ - LEX_IDENT
TOKEN_NUMBER equ 29
LEX_NUMBER db "<number>"
LEX_NUMBER_len equ $ - LEX_NUMBER
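;; TOKEN_IDENT and TOKEN_NUMBER have no fixed lexeme, so they live outside
;; the NUM_LEXEMES tables above and are produced by the is_ident/is_number
;; checks instead.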
section .text
;; rdi: length of matched lexeme
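;; Returns:
;;   rax: 1 if the text at the cursor is an identifier strictly longer than
;;        the matched lexeme (the cursor is advanced past it), else 0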
is_ident:
push rbp
mov rbp, rsp
push r12
push r13
push r14
push rdi
mov rax, [rel cursor]
mov r12, [rel buffer]
mov r13, [rel buffer_len]
sub r13, rax ; r13 = bytes remaining after the cursor
add r12, rax ; r12 = &buffer[cursor]
; check first char is id_start
mov dil, [r12]
call is_id_start
test rax, rax
je .not_ident
xor r14, r14
.loop:
cmp r14, r13
jge .done
mov dil, [r12 + r14]
; check for id_continue
call is_id_continue
test rax, rax
je .done
inc r14
jmp .loop
.done:
; r14 is length of ident
mov rdi, [rsp]
cmp r14, rdi
jle .not_ident
mov rax, [rel cursor]
add rax, r14
mov [rel cursor], rax ; advance the cursor past the identifier
mov rax, 1
jmp .epilogue
.not_ident:
xor rax, rax
.epilogue:
pop rdi
pop r14
pop r13
pop r12
pop rbp
ret
;; Stub: number literals are not recognised yet (see the TODO in
;; tokeniser_next_token), so this always reports "not a number".
is_number:
xor rax, rax
ret
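;; Scans the lexeme table for a keyword/operator starting at the cursor,
;; preferring a longer identifier or number when one is present.
;; Returns:
;;   rax: the matched token, or TOKEN_EOF when nothing matches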
find_lexeme:
push rbp
mov rbp, rsp
push r12
xor r12, r12
; for i..NUM_LEXEMES {
.loop:
cmp r12, NUM_LEXEMES
jge .not_found
; let lexeme = LEXEMES[i];
mov rdi, [LEXEMES + r12*8]
mov rax, [rel cursor]
mov rsi, [rel buffer]
add rsi, rax ; rsi = &buffer[cursor]
; let len = LEXEME_LENS[i];
mov rdx, [LEXEME_LENS + r12*8]
mov rcx, [rel buffer_len]
sub rcx, rdx
sub rcx, rax
js .next ; not enough input left for this lexeme
; if buffer[cursor..cursor+len] == lexeme {
call streq
test rax, rax
je .next ; rax == 0 means the strings differ, try the next lexeme
; if is_ident() {
mov rdi, [LEXEME_LENS + r12*8] ; is_ident takes the matched lexeme length in rdi
call is_ident
test rax, rax
; return TOKEN_IDENT;
jne .is_ident
; } else if is_number() {
call is_number
test rax, rax
; return TOKEN_NUMBER;
jne .is_number
; } else {
mov rax, [rel cursor]
; cursor += len;
mov rdi, [LEXEME_LENS + r12*8]
add rax, rdi
mov [rel cursor], rax
; return TOKENS[i];
movzx rax, byte [TOKENS + r12]
jmp .epilogue
; }
.next:
inc r12
jmp .loop
; }
; }
.not_found:
; if is_ident() {
xor edi, edi ; nothing matched, so any identifier length qualifies
call is_ident
test rax, rax
; return TOKEN_IDENT;
jne .is_ident
; } else if is_number() {
call is_number
test rax, rax
; return TOKEN_NUMBER;
jne .is_number
; } else {
; return TOKEN_EOF;
mov rax, TOKEN_EOF
; }
.epilogue:
pop r12
pop rbp
ret
.is_ident:
mov rax, TOKEN_IDENT
jmp .epilogue
.is_number:
mov rax, TOKEN_NUMBER
jmp .epilogue
;; rcx: lexeme index
;; Returns:
;; rax: token if matched, 0 if not matched
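;; On a match the cursor is advanced past the lexeme; if the match turns out
;; to be the prefix of a longer identifier, the cursor is advanced past the
;; whole identifier and TOKEN_IDENT is returned instead. (Lexeme indices line
;; up with the TOKEN_* values, so the index itself is the token.)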
try_lexeme:
push r8
push r15
push rcx ; save lexeme index
; compare lexeme with cursor
mov r9, [cursor]
mov r8, [buffer]
add r8, r9
mov rax, [buffer_len]
sub rax, r9
mov rdx, [LEXEME_LENS + rcx*8]
cmp rax, rdx
jl .not_equal
; compare memory
mov rcx, [LEXEMES + rcx*8]
mov r9, rdx
call streq
cmp rax, 1
jne .not_equal
; check if it could be an ident:
; the buffer must not have ended
; the next char must be id_continue
; first char must be id_start
mov rax, [cursor]
add rax, rdx
cmp rax, [buffer_len]
jge .not_ident
add rax, [buffer] ; rax = &buffer[cursor + len]
mov cl, [rax]
call is_id_continue
cmp rax, 1
jne .not_ident
; check first char
mov rax, [cursor]
add rax, [buffer] ; rax = &buffer[cursor]
mov cl, [rax]
call is_id_start
cmp rax, 1
jne .not_ident
; this is an ident
; move cursor forward while is_id_continue
mov r15, [cursor]
add r15, rdx
.try_lexeme_loop:
cmp r15, [buffer_len]
jge .done_ident
mov rax, [buffer]
mov cl, [rax + r15] ; cl = buffer[r15]
call is_id_continue
cmp rax, 1
jne .done_ident
inc r15
jmp .try_lexeme_loop
.done_ident:
mov [cursor], r15
pop rcx
pop r15
pop r8
mov rax, TOKEN_IDENT
ret
.not_ident:
mov rax, [cursor]
add rax, rdx
mov [cursor], rax
pop rcx
pop r15
pop r8
mov rax, rcx
ret
.not_equal:
pop rcx
pop r15
pop r8
xor rax, rax
ret
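;; Skips spaces, then tries every lexeme in the table against the text at the
;; cursor; bytes that match nothing are skipped as well for now (see TODO).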
;; Returns: rax = the next token, or TOKEN_EOF at end of input
tokeniser_next_token:
; check if at end of buffer
.loop:
mov rax, [cursor]
cmp rax, [buffer_len]
jge .eof
; get the next byte
mov rbx, [buffer]
add rbx, rax
mov bl, byte [rbx]
; skip whitespace
cmp bl, ' '
je .skip
mov r15, 1 ; lexeme index
.inner_loop:
cmp r15, NUM_LEXEMES
; TODO: numbers, idents
jge .skip
; try lexeme
mov rcx, r15
call try_lexeme
cmp rax, 0
jne .return_token
inc r15
jmp .inner_loop
.skip:
mov rax, [cursor]
inc rax
mov [cursor], rax
jmp .loop
.eof:
mov rax, TOKEN_EOF
ret
.return_token:
ret ; rax already holds the token returned by try_lexeme