;; Compile with:
;;   nasm -f elf64 main.asm -o main.o
;;
;; Syntax: NASM (Intel operand order). Target: x86-64 Linux (raw syscalls).
;;
;; NOTE(review): the call sites in this file do not agree on one calling
;; convention. _start, is_ident and find_lexeme pass arguments in rdi/rsi,
;; while compiler_entry, tokeniser_init, tokeniser_print and try_lexeme pass
;; them in rcx/rdx/r8/r9. The externs are defined elsewhere, so the argument
;; registers at every call to an external routine are left exactly as they
;; were. Confirm against the definitions of the externs and unify.

extern int_to_str
extern oom
extern panic
extern strlen
extern strcmp
extern streq
extern memcpy
extern eprint_str
extern exit
extern error_to_str
extern eprint_error
extern alloc_pages
extern allocate
extern fopen_read
extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start

;; Linux x86-64 syscall numbers and struct stat layout used below.
SYS_READ        equ 0
SYS_FSTAT       equ 5
STAT_SIZE       equ 144                 ; sizeof(struct stat)
STAT_ST_SIZE    equ 48                  ; offsetof(struct stat, st_size)

section .data

hello_msg           db "Hello, World!", 10
hello_msg_len       equ $ - hello_msg

file_error_msg      db "Could not open file: "
file_error_msg_len  equ $ - file_error_msg

test_success        db "All tests passed!", 10
test_success_len    equ $ - test_success

section .text
global _start

;; Process entry point: prints the success banner and exits with status 0.
_start:
        lea     rdi, [rel test_success]
        mov     rsi, test_success_len
        call    eprint_str
        xor     edi, edi                        ; exit status 0
        call    exit

;; Alternative entry point (not referenced anywhere in this file).
;; Tokenises the file named by argv[1] and prints the lexeme of every
;; keyword/punctuation token it finds.
;; Expects to be entered with the initial process stack: [rsp] = argc,
;; [rsp + 8] = argv[0], [rsp + 16] = argv[1].
compiler_entry:
        lea     rcx, [rel hello_msg]
        mov     rdx, hello_msg_len
        call    eprint_str

        mov     rax, [rsp]                      ; argc
        cmp     rax, 1
        jle     .no_filename                    ; argc <= 1: no filename given

        mov     rdx, [rsp + 16]                 ; rdx = argv[1]
        call    tokeniser_init
        call    tokeniser_print

.loop:
        call    tokeniser_next_token
        test    rax, rax
        jz      .exit                           ; TOKEN_EOF
        ; TOKEN_IDENT / TOKEN_NUMBER have no entry in LEXEMES / LEXEME_LENS
        ; (both tables hold NUM_LEXEMES entries), so they are not printed.
        cmp     rax, NUM_LEXEMES
        jae     .loop
        mov     rcx, [LEXEMES + rax*8]
        mov     rdx, [LEXEME_LENS + rax*8]
        call    eprint_str
        jmp     .loop

.no_filename:
        call    panic
        ; NOTE(review): if panic returns, execution continues into .exit.
.exit:
        ; NOTE(review): no exit status is loaded before this call.
        call    exit

;; =============================
;; Tokeniser functions
;; =============================

;; tokeniser state
section .data

input_file  dd 0                                ; file descriptor (32-bit)
buffer      dq 0                                ; pointer to the file contents
cursor      dq 0                                ; byte offset of the next unread char
buffer_len  dq 0                                ; number of valid bytes in buffer

;; Original design notes, kept for reference.
;; NOTE(review): nothing in this file implements the header described here.
;; each buffer is chunk_size bytes large
;; buffer header structure:
;;   +0 (8 bytes): pointer buffer
;;   +8 (8 bytes): size of buffer

;; Tokens:
;; [let, if, else, fn, return, loop, break, continue, true, false, i32, u32,
;;  bool, =, +, -, *, /, %, ==, !=, <, <=, >, >=, &&, ||, !, (, ), {, }, [, ],
;;  ;, ',', ]

section .bss

statbuf:    resb STAT_SIZE                      ; struct stat filled in by fstat

section .text

;; Initialises the tokeniser: opens the file, sizes it with fstat, allocates
;; a buffer of that size and reads the file into it.
;; In:    rdx = pointer to filename (null-terminated), passed on to fopen_read
;; Out:   input_file, buffer, buffer_len set; cursor = 0
;; Does not return on a failed fstat/read (reports the error, then panics).
tokeniser_init:
        push    r15                             ; r15 holds the file size across allocate

        ; open file for reading
        ; (original author's note: this panics if the file doesn't exist)
        call    fopen_read
        mov     [input_file], eax               ; store file descriptor
        mov     qword [cursor], 0
        mov     qword [buffer_len], 0

        ; fstat(fd, &statbuf)
        mov     eax, SYS_FSTAT
        mov     edi, [input_file]               ; 32-bit load: input_file is a dword
        lea     rsi, [rel statbuf]
        syscall
        test    rax, rax
        js      .report_error

        ; allocate a buffer as large as the file
        mov     r15, [statbuf + STAT_ST_SIZE]
        mov     rcx, r15
        call    allocate
        mov     [buffer], rax
        mov     [buffer_len], r15

        ; read(fd, buffer, buffer_len)
        mov     eax, SYS_READ
        mov     edi, [input_file]
        mov     rsi, [buffer]
        mov     rdx, [buffer_len]
        syscall
        test    rax, rax
        js      .report_error
        mov     [buffer_len], rax               ; only the bytes actually read are valid

        pop     r15
        ret

.report_error:
        mov     rcx, rax                        ; negative errno from the syscall
        call    eprint_error
        call    panic

section .rodata

tokeniser_buffer        db "Tokeniser buffer: ", 10
tokeniser_buffer_len    equ $ - tokeniser_buffer

section .text

;; Prints a heading followed by the unread part of the buffer
;; (buffer[cursor .. buffer_len]).
tokeniser_print:
        sub     rsp, 8                          ; keep rsp 16-byte aligned at the calls

        lea     rcx, [rel tokeniser_buffer]
        mov     rdx, tokeniser_buffer_len
        call    eprint_str

        mov     rax, [cursor]
        mov     rcx, [buffer]
        add     rcx, rax                        ; rcx = &buffer[cursor]
        mov     rdx, [buffer_len]
        sub     rdx, rax                        ; rdx = bytes remaining
        call    eprint_str

        add     rsp, 8
        ret

section .rodata

;; The three tables below are parallel and indexed by token number
;; (0 .. NUM_LEXEMES - 1).

align 8
LEXEMES: dq \
        LEX_NOT_A_LEXEME, \
        LEX_LET, \
        LEX_IF, \
        LEX_ELSE, \
        LEX_FN, \
        LEX_RETURN, \
        LEX_LOOP, \
        LEX_BREAK, \
        LEX_CONTINUE, \
        LEX_TRUE, \
        LEX_FALSE, \
        LEX_BOOL, \
        LEX_ARROW, \
        LEX_I32, \
        LEX_U32, \
        LEX_EQUALS, \
        LEX_PLUS, \
        LEX_MINUS, \
        LEX_RPARENS, \
        LEX_LPARENS, \
        LEX_RBRACE, \
        LEX_LBRACE, \
        LEX_COLON, \
        LEX_SEMI, \
        LEX_COMMA, \
        LEX_PIPE, \
        LEX_AMP, \
        LEX_EQEQ

TOKENS: db \
        TOKEN_EOF, \
        TOKEN_LET, \
        TOKEN_IF, \
        TOKEN_ELSE, \
        TOKEN_FN, \
        TOKEN_RETURN, \
        TOKEN_LOOP, \
        TOKEN_BREAK, \
        TOKEN_CONTINUE, \
        TOKEN_TRUE, \
        TOKEN_FALSE, \
        TOKEN_BOOL, \
        TOKEN_ARROW, \
        TOKEN_I32, \
        TOKEN_U32, \
        TOKEN_EQUALS, \
        TOKEN_PLUS, \
        TOKEN_MINUS, \
        TOKEN_RPARENS, \
        TOKEN_LPARENS, \
        TOKEN_RBRACE, \
        TOKEN_LBRACE, \
        TOKEN_COLON, \
        TOKEN_SEMI, \
        TOKEN_COMMA, \
        TOKEN_PIPE, \
        TOKEN_AMP, \
        TOKEN_EQEQ

align 8
LEXEME_LENS: dq \
        0, \
        LEX_LET_len, \
        LEX_IF_len, \
        LEX_ELSE_len, \
        LEX_FN_len, \
        LEX_RETURN_len, \
        LEX_LOOP_len, \
        LEX_BREAK_len, \
        LEX_CONTINUE_len, \
        LEX_TRUE_len, \
        LEX_FALSE_len, \
        LEX_BOOL_len, \
        LEX_ARROW_len, \
        LEX_I32_len, \
        LEX_U32_len, \
        LEX_EQUALS_len, \
        LEX_PLUS_len, \
        LEX_MINUS_len, \
        LEX_RPARENS_len, \
        LEX_LPARENS_len, \
        LEX_RBRACE_len, \
        LEX_LBRACE_len, \
        LEX_COLON_len, \
        LEX_SEMI_len, \
        LEX_COMMA_len, \
        LEX_PIPE_len, \
        LEX_AMP_len, \
        LEX_EQEQ_len

NUM_LEXEMES equ 28

LEX_NOT_A_LEXEME    db "", 0
TOKEN_EOF           equ 0

TOKEN_LET           equ 1
LEX_LET             db "let"
LEX_LET_len         equ $ - LEX_LET

TOKEN_IF            equ 2
LEX_IF              db "if"
LEX_IF_len          equ $ - LEX_IF

TOKEN_ELSE          equ 3
LEX_ELSE            db "else"
LEX_ELSE_len        equ $ - LEX_ELSE

TOKEN_FN            equ 4
LEX_FN              db "fn"
LEX_FN_len          equ $ - LEX_FN

TOKEN_RETURN        equ 5
LEX_RETURN          db "return"
LEX_RETURN_len      equ $ - LEX_RETURN

TOKEN_LOOP          equ 6
LEX_LOOP            db "loop"
LEX_LOOP_len        equ $ - LEX_LOOP

TOKEN_BREAK         equ 7
LEX_BREAK           db "break"
LEX_BREAK_len       equ $ - LEX_BREAK

TOKEN_CONTINUE      equ 8
LEX_CONTINUE        db "continue"
LEX_CONTINUE_len    equ $ - LEX_CONTINUE

TOKEN_TRUE          equ 9
LEX_TRUE            db "true"
LEX_TRUE_len        equ $ - LEX_TRUE

TOKEN_FALSE         equ 10
LEX_FALSE           db "false"
LEX_FALSE_len       equ $ - LEX_FALSE

TOKEN_BOOL          equ 11
LEX_BOOL            db "bool"
LEX_BOOL_len        equ $ - LEX_BOOL

TOKEN_ARROW         equ 12
LEX_ARROW           db "->"
LEX_ARROW_len       equ $ - LEX_ARROW

TOKEN_I32           equ 13
LEX_I32             db "i32"
LEX_I32_len         equ $ - LEX_I32

TOKEN_U32           equ 14
LEX_U32             db "u32"
LEX_U32_len         equ $ - LEX_U32

TOKEN_EQUALS        equ 15
LEX_EQUALS          db "="
LEX_EQUALS_len      equ $ - LEX_EQUALS

TOKEN_PLUS          equ 16
LEX_PLUS            db "+"
LEX_PLUS_len        equ $ - LEX_PLUS

TOKEN_MINUS         equ 17
LEX_MINUS           db "-"
LEX_MINUS_len       equ $ - LEX_MINUS

TOKEN_RPARENS       equ 18
LEX_RPARENS         db ")"
LEX_RPARENS_len     equ $ - LEX_RPARENS

TOKEN_LPARENS       equ 19
LEX_LPARENS         db "("
LEX_LPARENS_len     equ $ - LEX_LPARENS

TOKEN_RBRACE        equ 20
LEX_RBRACE          db "}"
LEX_RBRACE_len      equ $ - LEX_RBRACE

TOKEN_LBRACE        equ 21
LEX_LBRACE          db "{"
LEX_LBRACE_len      equ $ - LEX_LBRACE

TOKEN_COLON         equ 22
LEX_COLON           db ":"
LEX_COLON_len       equ $ - LEX_COLON

TOKEN_SEMI          equ 23
LEX_SEMI            db ";"
LEX_SEMI_len        equ $ - LEX_SEMI

TOKEN_COMMA         equ 24
LEX_COMMA           db ","
LEX_COMMA_len       equ $ - LEX_COMMA

TOKEN_PIPE          equ 25
LEX_PIPE            db "|"
LEX_PIPE_len        equ $ - LEX_PIPE

TOKEN_AMP           equ 26
LEX_AMP             db "&"
LEX_AMP_len         equ $ - LEX_AMP

TOKEN_EQEQ          equ 27
LEX_EQEQ            db "=="
LEX_EQEQ_len        equ $ - LEX_EQEQ

;; The two tokens below have no fixed spelling and no entry in the tables.
TOKEN_IDENT         equ 28
LEX_IDENT           db ""
LEX_IDENT_len       equ $ - LEX_IDENT

TOKEN_NUMBER        equ 29
LEX_NUMBER          db ""
LEX_NUMBER_len      equ $ - LEX_NUMBER

section .text

;; Checks whether an identifier strictly longer than the already-matched
;; lexeme starts at the cursor; if so, advances the cursor past it.
;; In:    rdi = length of matched lexeme (0 if none)
;; Out:   rax = 1 and cursor advanced if an identifier was consumed, else 0
;; Not referenced by the live tokeniser path (tokeniser_next_token uses
;; try_lexeme).
is_ident:
        push    rbp
        mov     rbp, rsp
        push    r12                             ; r12 = &buffer[cursor]
        push    r13                             ; r13 = bytes remaining
        push    r14                             ; r14 = identifier length so far
        push    rdi                             ; [rsp] = matched lexeme length

        mov     rax, [rel cursor]
        mov     r12, [rel buffer]
        add     r12, rax
        mov     r13, [rel buffer_len]
        sub     r13, rax
        jbe     .not_ident                      ; nothing left to inspect

        ; first char must be id_start
        movzx   edi, byte [r12]
        call    is_id_start
        test    rax, rax
        jz      .not_ident

        xor     r14d, r14d
.loop:
        cmp     r14, r13
        jae     .done
        movzx   edi, byte [r12 + r14]
        call    is_id_continue
        test    rax, rax
        jz      .done
        inc     r14
        jmp     .loop

.done:
        ; r14 = identifier length; it must extend past the matched lexeme
        mov     rdi, [rsp]
        cmp     r14, rdi
        jbe     .not_ident
        add     [rel cursor], r14
        mov     eax, 1
        jmp     .epilogue

.not_ident:
        xor     eax, eax
.epilogue:
        pop     rdi
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        ret

;; Placeholder: number recognition is not implemented; always returns 0.
is_number:
        xor     eax, eax
        ret

;; Table-driven matcher (not referenced by the live tokeniser path).
;; Out:   rax = TOKEN_IDENT, TOKEN_NUMBER, TOKENS[i] for the first lexeme i
;;              that matches at the cursor, or TOKEN_EOF if nothing matches
;; The cursor is advanced past whatever was consumed.
;;
;; NOTE(review): streq is called here with rdi = lexeme, rsi = &buffer[cursor],
;; rdx = lexeme length, rcx = (bytes remaining - lexeme length). try_lexeme
;; calls the same routine with rcx/rdx/r8/r9 = ptr, len, ptr, len. The two
;; cannot both be right; check the definition of streq.
find_lexeme:
        push    rbp
        mov     rbp, rsp
        push    r12                             ; r12 = lexeme index
        sub     rsp, 8                          ; keep rsp 16-byte aligned at the calls

        ; Entry 0 is the zero-length "not a lexeme" placeholder and would
        ; match everywhere, so the scan starts at 1 like tokeniser_next_token.
        mov     r12d, 1
.loop:
        cmp     r12, NUM_LEXEMES
        jae     .not_found

        mov     rdi, [LEXEMES + r12*8]          ; lexeme = LEXEMES[i]
        mov     rsi, [rel buffer]
        add     rsi, [rel cursor]               ; rsi = &buffer[cursor]
        mov     rdx, [LEXEME_LENS + r12*8]      ; len = LEXEME_LENS[i]
        mov     rcx, [rel buffer_len]
        sub     rcx, [rel cursor]
        jb      .next                           ; cursor beyond the buffer
        sub     rcx, rdx
        jb      .next                           ; fewer than len bytes remain

        ; if buffer[cursor..cursor+len] == lexeme {
        call    streq
        cmp     rax, 1                          ; try_lexeme treats 1 as "equal"
        jne     .next

        ;   if is_ident(len) { return TOKEN_IDENT; }
        mov     rdi, [LEXEME_LENS + r12*8]
        call    is_ident
        test    rax, rax
        jnz     .is_ident

        ;   else if is_number() { return TOKEN_NUMBER; }
        call    is_number
        test    rax, rax
        jnz     .is_number

        ;   else { cursor += len; return TOKENS[i]; }
        mov     rax, [LEXEME_LENS + r12*8]
        add     [rel cursor], rax
        movzx   eax, byte [TOKENS + r12]        ; TOKENS is a byte table
        jmp     .epilogue
        ; }

.next:
        inc     r12
        jmp     .loop

.not_found:
        xor     edi, edi                        ; no lexeme matched: length 0
        call    is_ident
        test    rax, rax
        jnz     .is_ident

        call    is_number
        test    rax, rax
        jnz     .is_number

        mov     eax, TOKEN_EOF
.epilogue:
        add     rsp, 8
        pop     r12
        pop     rbp
        ret

.is_ident:
        mov     eax, TOKEN_IDENT
        jmp     .epilogue

.is_number:
        mov     eax, TOKEN_NUMBER
        jmp     .epilogue

;; Tries to match one lexeme at the cursor.
;; In:    rcx = lexeme index
;; Out:   rax = 0            lexeme does not match (cursor unchanged)
;;        rax = TOKEN_IDENT  lexeme matched but is only the prefix of a longer
;;                           identifier (cursor moved past the identifier)
;;        rax = lexeme index lexeme matched (cursor moved past the lexeme)
;; The scan state lives in r13-r15 so it survives the external calls.
;; NOTE(review): lexemes are tried in table order by the caller, so "=" (15)
;; matches before "==" (27) is ever tried.
try_lexeme:
        push    r13                             ; r13 = lexeme index
        push    r14                             ; r14 = lexeme length
        push    r15                             ; r15 = identifier scan position
        mov     r13, rcx

        ; enough input left for this lexeme?
        mov     r9, [cursor]
        mov     r8, [buffer]
        add     r8, r9                          ; r8 = &buffer[cursor]
        mov     rax, [buffer_len]
        sub     rax, r9                         ; rax = bytes remaining
        mov     r14, [LEXEME_LENS + r13*8]
        cmp     rax, r14
        jb      .not_equal

        ; compare memory: streq(lexeme, len, &buffer[cursor], len)
        mov     rcx, [LEXEMES + r13*8]
        mov     rdx, r14
        mov     r9, r14
        call    streq
        cmp     rax, 1
        jne     .not_equal

        ; check if it could be an ident:
        ;   the buffer must not have ended
        ;   the next char must be id_continue
        ;   first char must be id_start
        mov     rax, [cursor]
        add     rax, r14
        cmp     rax, [buffer_len]
        jae     .not_ident
        mov     rcx, [buffer]
        movzx   ecx, byte [rcx + rax]           ; char right after the lexeme
        call    is_id_continue
        cmp     rax, 1
        jne     .not_ident

        mov     rax, [cursor]
        mov     rcx, [buffer]
        movzx   ecx, byte [rcx + rax]           ; first char of the lexeme
        call    is_id_start
        cmp     rax, 1
        jne     .not_ident

        ; this is an ident: move forward while is_id_continue
        mov     r15, [cursor]
        add     r15, r14
.scan:
        cmp     r15, [buffer_len]
        jae     .done_ident
        mov     rcx, [buffer]
        movzx   ecx, byte [rcx + r15]
        call    is_id_continue
        cmp     rax, 1
        jne     .done_ident
        inc     r15
        jmp     .scan

.done_ident:
        mov     [cursor], r15
        mov     eax, TOKEN_IDENT
        jmp     .epilogue

.not_ident:
        add     [cursor], r14                   ; consume the lexeme
        mov     rax, r13
        jmp     .epilogue

.not_equal:
        xor     eax, eax
.epilogue:
        pop     r15
        pop     r14
        pop     r13
        ret

;; Returns the next token in the buffer.
;; Out:   rax = token enumerator (TOKEN_EOF at end of input)
;; Bytes that start no known lexeme are skipped one at a time.
tokeniser_next_token:
        push    r15                             ; r15 = lexeme index being tried
.loop:
        ; check if at end of buffer
        mov     rax, [cursor]
        cmp     rax, [buffer_len]
        jae     .eof

        ; skip spaces without consulting the lexeme table
        mov     rcx, [buffer]
        cmp     byte [rcx + rax], ' '
        je      .skip

        mov     r15, 1                          ; index 0 is "not a lexeme"
.inner_loop:
        cmp     r15, NUM_LEXEMES                ; TODO: numbers, idents
        jae     .skip

        mov     rcx, r15
        call    try_lexeme
        test    rax, rax
        jnz     .return                         ; rax = lexeme index or TOKEN_IDENT

        inc     r15
        jmp     .inner_loop

.skip:
        inc     qword [cursor]
        jmp     .loop

.eof:
        mov     eax, TOKEN_EOF
.return:
        pop     r15
        ret