From f1faac639c036dcd1bd0145362e405230cc0d947 Mon Sep 17 00:00:00 2001
From: janis
Date: Tue, 21 Oct 2025 00:36:27 +0200
Subject: [PATCH] rework tokeniser

---
 lang/src/lib.asm  |  17 +++
 lang/src/main.asm | 287 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 262 insertions(+), 42 deletions(-)

diff --git a/lang/src/lib.asm b/lang/src/lib.asm
index 8b1a1ab..dcdf628 100644
--- a/lang/src/lib.asm
+++ b/lang/src/lib.asm
@@ -389,3 +389,20 @@ is_id_start:
 .is_ret:
     mov rax, 1
     ret
+
+;; dil: byte to check
+is_whitespace:
+    cmp dil, ' '
+    je .is_ws
+    cmp dil, 9 ; tab
+    je .is_ws
+    cmp dil, 10 ; newline
+    je .is_ws
+    cmp dil, 13 ; carriage return
+    je .is_ws
+    xor rax, rax
+    ret
+.is_ws:
+    mov rax, 1
+    ret
+
diff --git a/lang/src/main.asm b/lang/src/main.asm
index a07cbf8..1fc2a84 100644
--- a/lang/src/main.asm
+++ b/lang/src/main.asm
@@ -73,10 +73,9 @@ compiler_entry:
 
 
 ;; Opens file for reading:
-;; rdx: pointer to filename (null-terminated)
+;; rdi: pointer to filename (null-terminated)
 fopen_read:
     mov rax, 2 ; syscall: open
-    mov rdi, rdx ; filename
     mov rsi, 0 ; flags: O_RDONLY
     mov rdx, 0 ; mode
     syscall
@@ -86,46 +85,26 @@ fopen_read:
 
 .file_error:
     push rdi
-    mov rcx, rax
+    mov rdi, rax
     call eprint_error
-    pop rdi
-    mov rcx, rdi ; filename is in rdi
-    call strlen ; get length of filename
-    mov r9, rax ; r9 = filename length
 
-    mov rsi, r9
-    add rsi, file_error_msg_len ; + prefix
-    add rsi, 1 ; + newline
-    add rsi, 15
-    and rsi, -16 ; align up to 16
-    sub rsp, rsi ; allocate buffer
-    push rsi ; save allocation size
-
-    ; copy file_error_msg
-    lea rcx, [rsp + 8]
-    mov rdx, file_error_msg
-    mov r8, file_error_msg_len
-    call memcpy
-
-    ; copy filename
-    lea rcx, [rsp + 8 + file_error_msg_len]
-    mov rdx, rdi
-    mov r8, r9
-    call memcpy
-
-    ; trailing newline
-    lea rdx, [rsp + 8 + file_error_msg_len + r9]
-    mov byte [rdx], 10
-
-    ; print error message
-    lea rcx, [rsp + 8]
-    mov rdx, file_error_msg_len
-    add rdx, r9
-    add rdx, 1 ; include newline
+    lea rdi, [rel file_error_msg]
+    mov rsi, file_error_msg_len ; assemble-time constant, not an address
     call eprint_str
 
-    pop rsi
-    add rsp, rsi ; dealloc
+    mov rdi, [rsp] ; filename saved at .file_error
+    call strlen ; get length of filename
+    mov rsi, rax ; rsi = filename length
+    pop rdi ; rdi = filename again, strlen may have clobbered it
+    call eprint_str
+
+    mov rdi, 10
+    push rdi
+    mov rdi, rsp
+    mov rsi, 1
+    call eprint_str
+    pop rdi
+    call panic
 
 
 ;; =============================
@@ -212,23 +191,89 @@ LEXEMES: dq \
     LEX_IF, \
     LEX_ELSE, \
     LEX_FN, \
-    LEX_ARROW
+    LEX_RETURN, \
+    LEX_LOOP, \
+    LEX_BREAK, \
+    LEX_CONTINUE, \
+    LEX_TRUE, \
+    LEX_FALSE, \
+    LEX_BOOL, \
+    LEX_ARROW, \
+    LEX_I32, \
+    LEX_U32, \
+    LEX_EQUALS, \
+    LEX_PLUS, \
+    LEX_MINUS, \
+    LEX_RPARENS, \
+    LEX_LPARENS, \
+    LEX_RBRACE, \
+    LEX_LBRACE, \
+    LEX_COLON, \
+    LEX_SEMI, \
+    LEX_COMMA, \
+    LEX_PIPE, \
+    LEX_AMP, \
+    LEX_EQEQ
 TOKENS: db \
     TOKEN_EOF, \
     TOKEN_LET, \
     TOKEN_IF, \
     TOKEN_ELSE, \
     TOKEN_FN, \
-    TOKEN_ARROW
+    TOKEN_RETURN, \
+    TOKEN_LOOP, \
+    TOKEN_BREAK, \
+    TOKEN_CONTINUE, \
+    TOKEN_TRUE, \
+    TOKEN_FALSE, \
+    TOKEN_BOOL, \
+    TOKEN_ARROW, \
+    TOKEN_I32, \
+    TOKEN_U32, \
+    TOKEN_EQUALS, \
+    TOKEN_PLUS, \
+    TOKEN_MINUS, \
+    TOKEN_RPARENS, \
+    TOKEN_LPARENS, \
+    TOKEN_RBRACE, \
+    TOKEN_LBRACE, \
+    TOKEN_COLON, \
+    TOKEN_SEMI, \
+    TOKEN_COMMA, \
+    TOKEN_PIPE, \
+    TOKEN_AMP, \
+    TOKEN_EQEQ
 LEXEME_LENS: dq \
     0, \
     LEX_LET_len, \
     LEX_IF_len, \
     LEX_ELSE_len, \
     LEX_FN_len, \
-    LEX_ARROW_len
+    LEX_RETURN_len, \
+    LEX_LOOP_len, \
+    LEX_BREAK_len, \
+    LEX_CONTINUE_len, \
+    LEX_TRUE_len, \
+    LEX_FALSE_len, \
+    LEX_BOOL_len, \
+    LEX_ARROW_len, \
+    LEX_I32_len, \
+    LEX_U32_len, \
+    LEX_EQUALS_len, \
+    LEX_PLUS_len, \
+    LEX_MINUS_len, \
+    LEX_RPARENS_len, \
+    LEX_LPARENS_len, \
+    LEX_RBRACE_len, \
+    LEX_LBRACE_len, \
+    LEX_COLON_len, \
+    LEX_SEMI_len, \
+    LEX_COMMA_len, \
+    LEX_PIPE_len, \
+    LEX_AMP_len, \
+    LEX_EQEQ_len
 
-    NUM_LEXEMES equ 5
+    NUM_LEXEMES equ 28
     LEX_NOT_A_LEXEME db "", 0
 
     TOKEN_EOF equ 0
@@ -321,6 +366,164 @@ LEXEME_LENS: dq \
     LEX_NUMBER_len equ $ - LEX_NUMBER
 
 section .text
+
+;; Checks whether the text at buffer[cursor..] is an identifier that is
+;; longer than an already matched lexeme; on success the cursor is
+;; advanced past the identifier.
+;; rdi: length of matched lexeme
+;; Returns:
+;;   rax: 1 if an identifier was consumed, 0 otherwise
+is_ident:
+    push rbp
+    mov rbp, rsp
+    push r12
+    push r13
+    push r14
+    push rdi ; [rsp] = length of matched lexeme
+    mov rax, [rel cursor] ; load the cursor value, not its address
+    ; NOTE(review): buffer / buffer_len are still taken by address; if they
+    ; are pointer / length variables they need `mov` loads as well - confirm
+    lea r12, [rel buffer]
+    lea r13, [rel buffer_len]
+    sub r13, rax ; r13 = bytes left after the cursor
+    add r12, rax ; r12 = &buffer[cursor]
+
+    ; nothing left to read
+    test r13, r13
+    jle .not_ident
+    ; check first char is id_start
+    mov dil, [r12]
+    call is_id_start
+    test rax, rax
+    je .not_ident
+    xor r14, r14
+.loop:
+    cmp r14, r13
+    jge .done
+    mov dil, [r12 + r14]
+    ; check for id_continue
+    call is_id_continue
+    test rax, rax
+    je .done
+    inc r14
+    jmp .loop
+.done:
+    ; r14 is length of ident; it must extend past the matched lexeme
+    mov rdi, [rsp]
+    cmp r14, rdi
+    jle .not_ident
+    add [rel cursor], r14 ; cursor += ident length
+    mov rax, 1
+    jmp .epilogue
+.not_ident:
+    xor rax, rax
+.epilogue:
+    pop rdi
+    pop r14
+    pop r13
+    pop r12
+    pop rbp
+    ret
+
+;; Placeholder: number literals are not recognised yet.
+;; Returns:
+;;   rax: always 0
+is_number:
+    xor rax, rax
+    ret
+
+;; Matches the text at buffer[cursor..] against the LEXEMES table and
+;; advances the cursor past whatever was recognised.
+;; Returns:
+;;   rax: TOKEN_IDENT, TOKEN_NUMBER, the TOKENS entry of the matching
+;;        lexeme, or TOKEN_EOF if nothing matched
+find_lexeme:
+    push rbp
+    mov rbp, rsp
+    push r12
+    sub rsp, 8 ; keep rsp 16-byte aligned for the calls below
+    ; entry 0 is the LEX_NOT_A_LEXEME / TOKEN_EOF sentinel with length 0
+    mov r12, 1
+    ; for i in 1..NUM_LEXEMES {
+.loop:
+    cmp r12, NUM_LEXEMES
+    jge .not_found
+    ; let lexeme = LEXEMES[i];
+    lea rax, [rel LEXEMES]
+    mov rdi, [rax + r12*8]
+    mov rax, [rel cursor]
+    ; NOTE(review): buffer / buffer_len are still taken by address - confirm
+    lea rsi, [rel buffer]
+    add rsi, rax
+    ; let len = LEXEME_LENS[i];
+    lea rdx, [rel LEXEME_LENS]
+    mov rdx, [rdx + r12*8]
+    ; if len > buffer_len - cursor { continue; }
+    lea rcx, [rel buffer_len]
+    sub rcx, rax
+    sub rcx, rdx
+    jb .next
+    ; if buffer[cursor..cursor+len] == lexeme {
+    ; assumes streq returns 1 for equal like the other predicates - confirm
+    call streq
+    test rax, rax
+    je .next
+    ; if is_ident(len) {
+    lea rdi, [rel LEXEME_LENS]
+    mov rdi, [rdi + r12*8]
+    call is_ident
+    test rax, rax
+    ; return TOKEN_IDENT;
+    jne .is_ident
+    ; } else if is_number() {
+    call is_number
+    test rax, rax
+    ; return TOKEN_NUMBER;
+    jne .is_number
+    ; } else {
+    ; cursor += len;
+    lea rax, [rel LEXEME_LENS]
+    mov rax, [rax + r12*8]
+    add [rel cursor], rax
+    ; return TOKENS[i];
+    lea rax, [rel TOKENS]
+    movzx eax, byte [rax + r12] ; TOKENS is a byte table
+    jmp .epilogue
+    ; }
+.next:
+    inc r12
+    jmp .loop
+    ; }
+    ; }
+.not_found:
+    ; if is_ident(0) {
+    xor edi, edi ; no lexeme matched, any identifier length counts
+    call is_ident
+    test rax, rax
+    ; return TOKEN_IDENT;
+    jne .is_ident
+    ; } else if is_number() {
+    call is_number
+    test rax, rax
+    ; return TOKEN_NUMBER;
+    jne .is_number
+    ; } else {
+    ; return TOKEN_EOF;
+    mov rax, TOKEN_EOF
+    ; }
+.epilogue:
+    add rsp, 8
+    pop r12
+    pop rbp
+    ret
+.is_ident:
+    mov rax, TOKEN_IDENT
+    jmp .epilogue
+.is_number:
+    mov rax, TOKEN_NUMBER
+    jmp .epilogue
+
+
 ;; rcx: lexeme index
 ;; Returns:
 ;;   rax: token if matched, 0 if not matched