section .text

extern panic
extern strlen
extern strcmp
extern streq
extern memcpy
extern eprint_str
extern exit
extern error_to_str
extern eprint_error
extern alloc_pages
extern allocate
extern fopen_read
extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start
extern is_whitespace

global tokeniser_init
global tokeniser_init_buf
global tokeniser_print
global find_lexeme
global expect_token
global unwrap_token
global peek_expect_token
global peek_lexeme
global tokeniser_get_cursor
global tokeniser_set_cursor

;; =============================
;; Tokeniser functions
;; =============================

;; tokeniser state
section .data
    global input_file
    global buffer
    global cursor
    global buffer_len

    input_file dd 0     ; file descriptor (4 bytes: always access as a dword)
    buffer     dq 0     ; pointer to the start of the input bytes
    cursor     dq 0     ; offset of the next unread byte within buffer
    buffer_len dq 0     ; number of bytes in buffer

;; each buffer is chunk_size bytes large
;; buffer header structure:
;; +0 (8 bytes): pointer buffer
;; +8 (8 bytes): size of buffer

;; Tokens:
;; [let, if, else, fn, return, loop, break, continue, true, false, i32, u32, bool, =, +, -, *, /, %, ==, !=, <, <=, >, >=, &&, ||, !, (, ), {, }, [, ], ;, ',', ]

section .bss
    statbuf: resb 144   ; scratch space filled in by the fstat syscall

section .text

;; Points the tokeniser at an in-memory buffer and rewinds the cursor.
;; The stored file descriptor is reset to 0.
;; rdi: pointer to buffer
;; rsi: length of buffer
tokeniser_init_buf:
    push rbp
    mov rbp, rsp
    mov dword [rel input_file], 0
    mov qword [rel buffer], rdi
    mov qword [rel buffer_len], rsi
    mov qword [rel cursor], 0
    pop rbp
    ret

;; Initialises the tokeniser
;; Opens the file, queries its size with fstat, allocates a buffer of that
;; size and reads the file into it. Any failing syscall is reported through
;; eprint_error followed by panic.
;; rdx: pointer to filename (null-terminated)
;; NOTE(review): this routine does not touch the argument registers before
;; calling fopen_read, so the filename must already be in whichever register
;; fopen_read expects -- confirm against fopen_read.
tokeniser_init:
    push rbp
    mov rbp, rsp
    push r15                            ; r15 is callee-saved and holds the file size below
    sub rsp, 8                          ; keep rsp 16-byte aligned at the calls (assuming an aligned caller)

    ; open file for reading
    ; this panics if the file doesn't exist
    call fopen_read
    mov dword [rel input_file], eax     ; store file descriptor
    mov qword [rel cursor], 0
    mov qword [rel buffer_len], 0

    ; fstat
    mov rax, 5                          ; syscall: fstat
    mov edi, [rel input_file]           ; fd (dword load: input_file is only 4 bytes wide)
    lea rsi, [rel statbuf]              ; statbuf
    syscall
    cmp rax, 0
    jl .report_error

    ; get file size from statbuf
    lea r15, [rel statbuf]
    mov r15, [r15 + 48]                 ; offset of st_size in stat struct

    ; allocate buffer
    mov rdi, r15
    call allocate
    mov qword [rel buffer], rax
    mov qword [rel buffer_len], r15

    ; read file into buffer
    mov rax, 0                          ; syscall: read
    mov edi, [rel input_file]           ; fd
    mov rsi, [rel buffer]               ; buf
    mov rdx, [rel buffer_len]           ; count
    syscall
    cmp rax, 0
    jl .report_error

    add rsp, 8
    pop r15
    pop rbp
    ret

.report_error:
    ; rax holds the negative syscall result
    mov rcx, rax
    call eprint_error
    call panic

section .rdata
    tokeniser_buffer db "Tokeniser buffer: ", 10
    tokeniser_buffer_len equ $ - tokeniser_buffer

section .text

;; Debug helper: prints a banner followed by the unread part of the buffer
;; (from the cursor to the end) through eprint_str.
tokeniser_print:
    push rbp
    mov rbp, rsp

    lea rdi, [rel tokeniser_buffer]
    mov rsi, tokeniser_buffer_len
    call eprint_str

    mov rax, [rel cursor]
    mov rdi, [rel buffer]
    add rdi, rax                        ; start at buffer + cursor
    mov rsi, [rel buffer_len]
    sub rsi, rax                        ; remaining = buffer_len - cursor
    jbe .done                           ; nothing left, or cursor is past the end
    call eprint_str

.done:
    pop rbp
    ret

section .rdata
    global LEXEMES
    global TOKENS
    global LEXEME_LENS
    global NUM_LEXEMES
%include "src/tokeniser.inc"

section .text

;; Tries to read an identifier at the cursor. It only counts as one when it
;; is strictly longer than the lexeme that was already matched there; in
;; that case the cursor is advanced past the identifier.
;; rdi: length of previously matched lexeme
;; returns the length of the ident, or 0 when there is none
;; fn is_ident(lexeme_len: usize) -> usize
is_ident:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14
    push rdi                            ; keep lexeme_len for the comparison in .done

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    sub r13, rax                        ; r13 = bytes left in the buffer
    add r12, rax                        ; r12 = pointer to the current byte

    ; check first char is id_start
    mov dil, [r12]
    call is_id_start
    test rax, rax
    je .not_ident

    mov r14, 1
.loop:
    cmp r14, r13
    jge .done
    mov dil, [r12 + r14]
    ; check for id_continue
    call is_id_continue
    test rax, rax
    je .done
    inc r14
    jmp .loop

.done:
    ; r14 is length of ident
    mov rdi, [rsp]
    cmp r14, rdi
    jle .not_ident                      ; not longer than the matched lexeme
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue

.not_ident:
    xor rax, rax

.epilogue:
    pop rdi
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Recognises a "//" line comment at the cursor.
;; returns the length of the comment (the terminating newline is not
;; included) and advances the cursor past it, or returns 0 and leaves the
;; cursor alone when the cursor is not at "//".
is_comment:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    add r12, rax                        ; r12 = pointer to the current byte
    sub r13, rax                        ; r13 = bytes left in the buffer

    mov dil, [r12]
    cmp dil, '/'
    jne .not_comment
    mov r14, 1
    cmp r14, r13
    jge .not_comment                    ; a lone '/' at the very end of the input
    mov dil, [r12 + r14]
    cmp dil, '/'
    jne .not_comment

.loop:
    inc r14
    cmp r14, r13
    jge .comment                        ; comment runs to the end of the input
    mov dil, [r12 + r14]
    cmp dil, 10                         ; newline
    jne .loop

.comment:
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue

.not_comment:
    xor rax, rax

.epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;;
;; Strings are sequences of characters enclosed in double quotes
;; Strings span multiple lines, and may in the future contain escape sequences
;; A backslash currently just causes the following byte to be skipped, so an
;; escaped quote does not end the string.
;; returns the length of the string lexeme including both quotes and
;; advances the cursor past it, or returns 0 (cursor untouched) when the
;; cursor is not at a '"'. An unterminated string yields the number of bytes
;; scanned and is consumed as well, so the tokeniser always makes progress.
is_string:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    add r12, rax                        ; r12 = pointer to the current byte
    sub r13, rax                        ; r13 = bytes left in the buffer

    mov dil, [r12]
    cmp dil, '"'
    jne .not_string

    mov r14, 1                          ; r14 = bytes scanned so far (opening quote)
.loop:
    cmp r14, r13
    jge .unterminated
    mov dil, [r12 + r14]
    cmp dil, '"'
    je .string
    cmp dil, 0x5c                       ; backslash
    je .escape
    inc r14
    jmp .loop

.escape:
    inc r14                             ; step over the backslash
    cmp r14, r13
    jge .unterminated
    inc r14                             ; step over the escaped byte
    jmp .loop

.string:
    inc r14                             ; include closing quote
    jmp .consume

.unterminated:
    ;; TODO: report unterminated string error
    ; Fall through and consume what was scanned. Returning a non-zero length
    ; without moving the cursor would make every later call find the same
    ; unterminated string again.

.consume:
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue

.not_string:
    xor rax, rax

.epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Numbers are sequences of numeric characters, interspersed with underscores
;; The leading character must be numeric
;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
;; returns the length of the number lexeme at the cursor and advances the
;; cursor past it, or returns 0 (cursor untouched) when the byte at the
;; cursor is not numeric.
is_number:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    add r12, rax                        ; r12 = pointer to the current byte
    sub r13, rax                        ; r13 = bytes left in the buffer

    mov dil, [r12]
    call is_numeric
    test rax, rax
    je .not_number

    mov r14, 1
.loop:
    cmp r14, r13
    jge .number
    mov dil, [r12 + r14]
    cmp dil, '_'                        ; underscores are allowed after the first digit
    je .loop_next
    call is_numeric
    test rax, rax
    je .number
.loop_next:
    inc r14
    jmp .loop

.number:
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue

.not_number:
    xor rax, rax

.epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Advances the cursor over every byte for which is_whitespace returns
;; non-zero, stopping at the first other byte or at the end of the buffer.
skip_whitespaces:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    ; let start = buffer.add(cursor);
    ; let end = buffer.add(buffer_len);
    mov r12, [rel cursor]
    mov r13, [rel buffer_len]
    mov r14, [rel buffer]

    ; for ptr in start..end {
.loop:
    cmp r12, r13
    jge .done
    mov dil, [r14 + r12]
    call is_whitespace
    test rax, rax
    je .done
    inc r12
    jmp .loop
    ; }

.done:
    mov [rel cursor], r12
    pop r14
    pop r13
    pop r12
    pop rbp
    ret

;; Reads the next lexeme at the cursor, after skipping whitespace, and
;; describes it in the caller-provided out-struct:
;;   +0  (8 bytes): token
;;   +8  (8 bytes): pointer to the first byte of the lexeme
;;   +16 (8 bytes): length of the lexeme
;; The cursor is advanced past the lexeme. Strings, numbers and comments are
;; tried first, then the LEXEMES table (from index 1), then a plain
;; identifier. If nothing matches, the token is TOKEN_EOF and the cursor
;; stays where it is.
;; rdi: pointer to out-struct
;; returns rax = out-struct pointer, except on the end-of-input path, which
;; returns rax = TOKEN_EOF
;; fn find_lexeme() -> (u8, *const u8, usize)
find_lexeme:
    push rbp
    mov rbp, rsp
    push rdi

    ; skip whitespaces
    call skip_whitespaces

    ;; init out struct
    mov rdi, [rsp]
    mov rax, [rel buffer]
    add rax, [rel cursor]
    mov qword [rdi], 0
    mov [rdi + 8], rax
    mov qword [rdi + 16], 0

    ; check length
    mov rax, [rel cursor]
    mov rcx, [rel buffer_len]
    ; if cursor >= buffer_len {
    cmp rax, rcx
    jl .start

.eof:
    ;   return TOKEN_EOF;
    ; Record the token in the out-struct as well: callers such as
    ; expect_token read the token from there, not from rax.
    mov qword [rdi], TOKEN_EOF
    mov rax, TOKEN_EOF
    pop rdi
    pop rbp
    ret
    ; }

.start:
    push r12                            ; r12 = lexeme table index; out pointer is now at [rsp + 8]

    ; test special tokens:
    ; if buffer[cursor] == '"' {
    call is_string
    test rax, rax
    jne .is_string
    ; } else if buffer[cursor].is_numeric() {
    call is_number
    ;   return is_number();
    test rax, rax
    jne .is_number
    ; } else if buffer[cursor..][..2] == "//" {
    call is_comment
    ;   // skip to end of line
    test rax, rax
    jne .is_comment
    ; }

.loop_init:
    mov r12, 1
    ; for 1..NUM_LEXEMES {
.loop:
    cmp r12, [rel NUM_LEXEMES]
    jge .not_found

    ; let lexeme = LEXEMES[i];
    lea rdi, [rel LEXEMES]
    mov rdi, [rdi + r12*8]
    ; let len = LEXEME_LENS[i];
    lea rdx, [rel LEXEME_LENS]
    mov rsi, [rdx + r12*8]

    mov rax, [rel cursor]
    mov rdx, [rel buffer]
    add rdx, rax
    mov rcx, [rel buffer_len]
    sub rcx, rax
    jo .not_found

    ; if lexeme.len() > buffer.len() - cursor {
    cmp rsi, rcx
    jg .next
    ;   continue;
    ; }

    mov rcx, rsi
    ; if buffer[cursor..cursor+len] == lexeme {
    call streq
    test rax, rax
    je .next

    ; if is_ident() {
    ; Reload the lexeme length from the table: rsi is not guaranteed to
    ; survive the call to streq.
    lea rsi, [rel LEXEME_LENS]
    mov rdi, [rsi + r12*8]
    call is_ident
    test rax, rax
    ;   return TOKEN_IDENT;
    jne .is_ident

    ; } else {
    ;   cursor += len;
    lea rsi, [rel LEXEME_LENS]
    mov rsi, [rsi + r12*8]
    mov rax, [rel cursor]
    add rax, rsi
    mov [rel cursor], rax
    ;   return TOKENS[i];
    lea rax, [rel TOKENS]
    movzx eax, byte [rax + r12]
    mov rdi, [rsp + 8]
    mov [rdi], al
    mov [rdi + 16], rsi
    jmp .epilogue
    ; }

.next:
    inc r12
    jmp .loop
    ; }
    ; }

.not_found:
    ; if is_ident() {
    xor rdi, rdi
    call is_ident
    test rax, rax
    ;   return TOKEN_IDENT;
    jne .is_ident
    ; } else {
    ;   return TOKEN_EOF;
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_EOF
    ; }

.epilogue:
    pop r12
    pop rdi
    pop rbp
    mov rax, rdi
    ret

.is_ident:
    ; rax = len
    ; out.0 = TOKEN_IDENT
    ; out.1 = buffer.add(cursor - len)
    ; out.2 = len
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_IDENT
    mov [rdi + 16], rax
    jmp .epilogue

.is_number:
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_NUMBER
    mov [rdi + 16], rax
    jmp .epilogue

.is_string:
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_STRING
    mov [rdi + 16], rax
    jmp .epilogue

.is_comment:
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_COMMENT
    mov [rdi + 16], rax
    jmp .epilogue

;; Reads the next lexeme and checks it against the expected token.
;; On a match the lexeme stays consumed; on a mismatch the cursor is put
;; back where it was before the call.
;; dil: expected token
;; returns rax = lexeme pointer, rdx = lexeme length on a match,
;;         rax = 0, rdx = 0 otherwise
expect_token:
    push rbp
    mov rbp, rsp
    sub rsp, 0x30                       ; [rsp]: expected token, [rsp+8]: saved cursor, [rsp+0x10..]: out-struct
    mov [rsp], dil
    mov rax, [rel cursor]               ; current cursor
    mov [rsp + 8], rax
    lea rdi, [rsp + 0x10]
    call find_lexeme
    mov rax, [rsp + 0x10]               ; found token
    mov dil, [rsp]                      ; expected token
    cmp al, dil
    je .matched
    mov rdi, [rsp + 8]
    mov [rel cursor], rdi               ; restore cursor
    xor rax, rax
    xor rdx, rdx
    jmp .epilogue

.matched:
    mov rax, [rsp + 0x18]               ; lexeme pointer
    mov rdx, [rsp + 0x20]               ; lexeme length

.epilogue:
    add rsp, 0x30
    pop rbp
    ret

;; dil: expected token
;; Same contract as expect_token, except that a mismatch (rax == 0) ends in
;; a call to panic instead of being returned to the caller.
unwrap_token:
    push rbp
    mov rbp, rsp

    call expect_token
    cmp rax, 0
    je .no_match

    pop rbp
    ret

.no_match:
    call panic

;; returns 0 if token not found, else returns lexeme (ptr, len)
;; The cursor is put back to its value from before the call in both cases,
;; so nothing is consumed.
;; dil: expected token
peek_expect_token:
    push rbp
    mov rbp, rsp

    push qword [rel cursor]             ; remember where we started
    call expect_token                   ; rax/rdx carry the result through
    pop rdi
    mov [rel cursor], rdi               ; rewind

    pop rbp
    ret

;; Runs find_lexeme into the caller's out-struct and then rewinds the
;; cursor, so the lexeme is described but not consumed.
;; rdi: out-struct pointer
;; returns rax = out-struct pointer
peek_lexeme:
    push rbp
    mov rbp, rsp

    push rdi                            ; out-struct pointer, handed back in rax
    push qword [rel cursor]             ; current cursor
    call find_lexeme
    pop rdi
    mov [rel cursor], rdi               ; restore cursor
    pop rax

    pop rbp
    ret

;; returns rax = current cursor offset
tokeniser_get_cursor:
    mov rax, qword [rel cursor]
    ret

;; rdi: new cursor offset (stored as given)
tokeniser_set_cursor:
    mov qword [rel cursor], rdi
    ret