diff --git a/lang/src/lib.asm b/lang/src/lib.asm
index fea6f52..2637de6 100644
--- a/lang/src/lib.asm
+++ b/lang/src/lib.asm
@@ -13,6 +13,8 @@ section .text
 global oom
 global panic
 global strlen
+global strcmp
+global streq
 global memcpy
 global eprint_str
 global exit
@@ -20,6 +22,11 @@ global error_to_str
 global eprint_error
 global alloc_pages
 
+global is_alpha
+global is_numeric
+global is_id_continue
+global is_id_start
+
 ;; ==============================
 ;; Helper functions
 ;; ==============================
@@ -73,6 +80,96 @@ strlen:
 .strlen_done:
     ret
 
+;; Checks two byte slices for equality
+;; rcx: pointer to first slice
+;; rdx: length of first slice
+;; r8: pointer to second slice
+;; r9: length of second slice
+;; returns: 1 if equal, 0 if not equal
+;; clobbers: rax, r10 (no callee-saved register is touched)
+streq:
+    cmp     rdx, r9
+    jne     .not_equal              ; lengths differ
+
+    xor     r10, r10                ; index = 0
+.loop:
+    cmp     r10, rdx
+    jae     .equal                  ; done all bytes (lengths are unsigned)
+
+    mov     al, [rcx + r10]         ; char from a
+    cmp     al, [r8 + r10]          ; against char from b, without using rbx
+    jne     .not_equal              ; chars differ
+
+    inc     r10
+    jmp     .loop
+.equal:
+    mov     rax, 1                  ; equal
+    ret
+.not_equal:
+    xor     rax, rax                ; not equal
+    ret
+
+
+;; Compares two byte slices by unsigned byte value; a proper prefix sorts first
+;; rcx: pointer to first slice
+;; rdx: length of first slice
+;; r8: pointer to second slice
+;; r9: length of second slice
+;; returns: -1, 0, or 1 in rax
+;; clobbers: r10, r11 (no callee-saved register is touched)
+strcmp:
+    xor     rax, rax                ; result = 0 (assume equal)
+    test    rdx, rdx
+    jz      .check_empty_b          ; if len a == 0
+    test    r9, r9
+    jz      .check_empty_a          ; if len b == 0
+
+    mov     r10, rdx                ; min(len a, len b) in r10
+    cmp     r9, rdx
+    cmovb   r10, r9
+
+    xor     r11, r11                ; index = 0
+.loop:
+    cmp     r11, r10
+    jae     .after_loop             ; done min(len a, len b) bytes
+
+    mov     al, [rcx + r11]         ; char from a
+    cmp     al, [r8 + r11]          ; against char from b, without using rbx
+    jb      .less                   ; if a[i] < b[i]: return -1
+    ja      .greater                ; if a[i] > b[i]: return 1
+
+    inc     r11
+    jmp     .loop
+
+.after_loop:
+    cmp     rdx, r9
+    je      .equal                  ; lengths equal, strings equal
+    jb      .less                   ; a shorter than b
+    ja      .greater                ; a longer than b
+
+.equal:
+    xor     rax, rax                ; 0
+    ret
+
+.less:
+    mov     rax, -1
+    ret
+
+.greater:
+    mov     rax, 1
+    ret
+
+.check_empty_a:
+    test    rdx, rdx
+    jz      .equal                  ; both empty
+    jmp     .greater                ; a not empty, b empty
+
+.check_empty_b:
+    test    r9, r9
+    jz      .equal                  ; both empty
+    jmp     .less                   ; b not empty, a empty
+
+
 ;; Copy bytes from one memory location to another
 ;; rcx: destination pointer
 ;; rdx: source pointer
@@ -193,3 +290,66 @@ alloc_pages:
     mov rcx, rax ; error code
     call eprint_error
     call oom
+
+;; Returns 1 if cl is an ASCII alphabetic character, 0 otherwise
+;; cl: byte to check
+is_alpha:
+    cmp     cl, 'A'
+    jb      .not_alpha
+    cmp     cl, 'Z'
+    jbe     .is_alpha_ret
+    cmp     cl, 'a'
+    jb      .not_alpha              ; between 'Z' and 'a'
+    cmp     cl, 'z'
+    ja      .not_alpha              ; above 'z' must not fall through to the 1 return
+.is_alpha_ret:
+    mov     rax, 1
+    ret
+.not_alpha:
+    xor     rax, rax
+    ret
+
+;; Returns 1 if cl is an ASCII decimal digit ('0'..'9'), 0 otherwise
+;; cl: byte to check
+is_numeric:
+    cmp     cl, '0'
+    jb      .not_numeric
+    cmp     cl, '9'
+    ja      .not_numeric            ; above '9' must not fall through to the 1 return
+.is_numeric_ret:
+    mov     rax, 1
+    ret
+.not_numeric:
+    xor     rax, rax
+    ret
+
+;; Returns 1 if cl is an ASCII letter, a decimal digit, or '_', 0 otherwise
+;; cl: byte to check
+is_id_continue:
+    call    is_alpha
+    cmp     rax, 1
+    je      .is_id_continue_ret
+    call    is_numeric
+    cmp     rax, 1
+    je      .is_id_continue_ret
+    cmp     cl, '_'
+    je      .is_id_continue_ret
+    xor     rax, rax
+    ret
+.is_id_continue_ret:
+    mov     rax, 1
+    ret
+
+;; Returns 1 if cl is an ASCII letter or '_', 0 otherwise
+;; cl: byte to check
+is_id_start:
+    call    is_alpha
+    cmp     rax, 1
+    je      .is_ret
+    cmp     cl, '_'
+    je      .is_ret
+    xor     rax, rax
+    ret
+.is_ret:
+    mov     rax, 1
+    ret
diff --git a/lang/src/main.asm b/lang/src/main.asm
index 432cba5..e7e6341 100644
--- a/lang/src/main.asm
+++ b/lang/src/main.asm
@@ -4,6 +4,8 @@ extern int_to_str
 extern oom
 extern panic
 extern strlen
+extern strcmp
+extern streq
 extern memcpy
 extern eprint_str
 extern exit
@@ -11,6 +13,11 @@ extern error_to_str
 extern eprint_error
 extern alloc_pages
 
+extern is_alpha
+extern is_numeric
+extern is_id_continue
+extern is_id_start
+
 section .data
     hello_msg db "Hello, World!", 10
     hello_msg_len equ $ - hello_msg
@@ -19,14 +26,6 @@ section .data
 section .text
 global _start
 
-test_fn:
-    push r11
-    push r10
-    pop r10
-    pop r11
-    ret
-
-
 _start:
     ; get filename from argv[1]
     ; argv is at rsp + 8
@@ -44,7 +43,14 @@ _start:
     mov rdx, rax ; rdx = pointer to filename
     call tokeniser_init
     call tokeniser_print
-    jmp .exit
+.loop:
+    call    tokeniser_next_token    ; rax = lexeme index, 0 at end of input
+    test    rax, rax
+    jz      .exit
+    mov     rcx, [LEXEMES + rax*8]  ; rcx = pointer to the lexeme text
+    mov     rdx, [LEXEME_LENS + rax*8]
+    call    eprint_str
+    jmp     .loop
 
 .no_filename:
     call panic
@@ -186,3 +192,202 @@ tokeniser_print:
     mov rcx, [buffer + rax]
     mov rdx, [buffer_len]
     call eprint_str
+
+section .rdata
+;; Parallel lexeme tables, indexed by lexeme index. Index 0 is a placeholder,
+;; so the real lexemes live at indices 1..NUM_LEXEMES.
+LEXEMES: dq \
+    LEX_NOT_A_LEXEME, \
+    LEX_LET, \
+    LEX_IF, \
+    LEX_ELSE, \
+    LEX_FN, \
+    LEX_ARROW
+TOKENS: db \
+    TOKEN_EOF, \
+    TOKEN_LET, \
+    TOKEN_IF, \
+    TOKEN_ELSE, \
+    TOKEN_FN, \
+    TOKEN_ARROW
+LEXEME_LENS: dq \
+    0, \
+    LEX_LET_len, \
+    LEX_IF_len, \
+    LEX_ELSE_len, \
+    LEX_FN_len, \
+    LEX_ARROW_len
+
+    NUM_LEXEMES equ 5               ; real lexemes only; the index 0 placeholder is not counted
+
+    LEX_NOT_A_LEXEME db "", 0
+    TOKEN_EOF equ 0
+    TOKEN_LET equ 1
+    LEX_LET db "let"
+    LEX_LET_len equ $ - LEX_LET
+    TOKEN_IF equ 2
+    LEX_IF db "if"
+    LEX_IF_len equ $ - LEX_IF
+    TOKEN_ELSE equ 3
+    LEX_ELSE db "else"
+    LEX_ELSE_len equ $ - LEX_ELSE
+    TOKEN_FN equ 4
+    LEX_FN db "fn"
+    LEX_FN_len equ $ - LEX_FN
+    TOKEN_RETURN equ 5
+    TOKEN_LOOP equ 6
+    TOKEN_BREAK equ 7
+    TOKEN_CONTINUE equ 8
+    TOKEN_TRUE equ 9
+    TOKEN_FALSE equ 10
+    TOKEN_BOOL equ 11
+    TOKEN_ARROW equ 12
+    LEX_ARROW db "->"
+    LEX_ARROW_len equ $ - LEX_ARROW
+    TOKEN_I32 equ 13
+    TOKEN_U32 equ 14
+    TOKEN_EQUALS equ 15
+    TOKEN_PLUS equ 16
+    TOKEN_MINUS equ 17
+    TOKEN_RPARENS equ 18
+    TOKEN_LPARENS equ 19
+    TOKEN_RBRACE equ 20
+    TOKEN_LBRACE equ 21
+    TOKEN_COLON equ 22
+    TOKEN_SEMI equ 23
+    TOKEN_COMMA equ 24
+    TOKEN_PIPE equ 25
+    TOKEN_AMP equ 26
+    TOKEN_IDENT equ 27
+    TOKEN_NUMBER equ 28
+
+section .text
+;; Tries to match a single lexeme against the buffer at the cursor.
+;; On a match the cursor is moved past the lexeme. If the matched text begins
+;; with an identifier-start byte and is followed by an identifier byte, the
+;; whole identifier is consumed and TOKEN_IDENT is returned instead.
+;; rcx: pointer to lexeme
+;; rdx: length of lexeme
+;; r8: token to return if matched
+;; Returns:
+;;   rax: token if matched, 0 if not matched
+;; Clobbers rcx and r9; expects streq and the is_id_* helpers to leave rdx
+;; untouched across the calls below.
+try_lexeme:
+    push    r8                      ; keep the token argument; r8 is reused below
+    push    r15
+    ; compare lexeme with cursor
+    mov     r9, [cursor]
+    mov     r8, [buffer]
+    add     r8, r9                  ; r8 = address of the byte under the cursor
+    mov     rax, [buffer_len]
+    sub     rax, r9                 ; rax = bytes remaining
+    cmp     rax, rdx
+    jl      .not_equal              ; fewer bytes left than the lexeme needs
+    ; compare memory
+    mov     r9, rdx
+    call    streq
+    cmp     rax, 1
+    jne     .not_equal
+    ; check if it could be an ident:
+    ;   the buffer must not have ended
+    ;   the next char must be id_continue
+    ;   first char must be id_start
+    mov     rax, [cursor]
+    add     rax, rdx
+    cmp     rax, [buffer_len]
+    jge     .not_ident
+    mov     r9, [buffer]            ; buffer holds a pointer: index through it
+    mov     cl, [r9 + rax]
+    call    is_id_continue
+    cmp     rax, 1
+    jne     .not_ident
+    ; check first char
+    mov     rax, [cursor]
+    mov     r9, [buffer]            ; reloaded: r9 is not assumed to survive the call
+    mov     cl, [r9 + rax]
+    call    is_id_start
+    cmp     rax, 1
+    jne     .not_ident
+    ; this is an ident
+    ; move cursor forward while is_id_continue
+    mov     r15, [cursor]
+    add     r15, rdx
+.try_lexeme_loop:
+    cmp     r15, [buffer_len]
+    jge     .done_ident
+    mov     r9, [buffer]
+    mov     cl, [r9 + r15]
+    call    is_id_continue
+    cmp     rax, 1
+    jne     .done_ident
+    inc     r15
+    jmp     .try_lexeme_loop
+.done_ident:
+    mov     [cursor], r15
+    pop     r15
+    pop     r8
+    mov     rax, TOKEN_IDENT
+    ret
+.not_ident:
+    mov     rax, [cursor]
+    add     rax, rdx
+    mov     [cursor], rax           ; consume exactly the lexeme
+    pop     r15
+    pop     r8
+    mov     rax, r8                 ; return the caller-supplied token
+    ret
+.not_equal:
+    pop     r15
+    pop     r8
+    xor     rax, rax
+    ret
+
+;; Scans forward from the cursor for the next known lexeme; any byte at
+;; which no lexeme matches (spaces included) is skipped.
+;; Returns:
+;;   rax: index of the matched lexeme in LEXEMES / LEXEME_LENS, or
+;;        TOKEN_EOF (0) once the cursor reaches the end of the buffer
+;; NOTE: the token value returned by try_lexeme is only tested for non-zero;
+;; the lexeme index is what gets returned (_start indexes LEXEMES with it).
+;; Clobbers rcx, rdx, r8, r9; r15 is saved and restored.
+tokeniser_next_token:
+    push    r15                     ; r15 carries the lexeme index across try_lexeme calls
+    ; check if at end of buffer
+.loop:
+    mov     rax, [cursor]
+    cmp     rax, [buffer_len]
+    jge     .eof
+
+    ; skip whitespace (only the byte under the cursor is inspected)
+    mov     r8, [buffer]
+    cmp     byte [r8 + rax], ' '
+    je      .skip
+
+    mov     r15, 1                  ; lexeme index
+.inner_loop:
+    cmp     r15, NUM_LEXEMES
+    ; TODO: numbers, idents
+    ja      .skip                   ; indices 1..NUM_LEXEMES inclusive are valid
+    ; try lexeme
+    mov     rcx, [LEXEMES + r15*8]
+    mov     rdx, [LEXEME_LENS + r15*8]
+    movzx   r8, byte [TOKENS + r15] ; TOKENS is a byte table
+    call    try_lexeme
+    test    rax, rax
+    jnz     .return_token
+    inc     r15
+    jmp     .inner_loop
+.skip:
+    mov     rax, [cursor]
+    inc     rax
+    mov     [cursor], rax
+    jmp     .loop
+.eof:
+    pop     r15
+    mov     rax, TOKEN_EOF
+    ret
+.return_token:
+    mov     rax, r15
+    pop     r15
+    ret