formatting, strings, comments

test numbers
is_number
2025-10-28 18:16:39 +01:00 · 2025-10-28 17:16:28 +01:00 · 2025-10-28 17:07:05 +01:00 · 2025-10-28 16:52:17 +01:00 · 2025-10-28 16:48:13 +01:00 · 2025-10-28 16:47:54 +01:00
8 changed files with 410 additions and 67 deletions
--- a/lang/Makefile
+++ b/lang/Makefile
@ -25,6 +25,10 @@ test: test-bins
 	  "$$b" || exit $$?; \
 	done
 fmt: $(wildcard tests/*.rs)
 	@echo "Formatting test source files..."
 	rustfmt --edition 2024 $^
 # pattern rule: compile each .rs into a binary with the same base name
 $(TARGET_DIR)/tests/%: tests/%.rs | $(OBJ) $(TARGET_DIR)/tests
 	@echo "[$(RUSTC)] $< -> $@"
@ -37,7 +41,7 @@ $(TARGET_DIR)/tests: $(TARGET_DIR)
 	mkdir -p $(TARGET_DIR)/tests
 $(TARGET_DIR)/%.o: src/%.asm | $(TARGET_DIR)
-	nasm -f elf64 -g $< -o $@
+	nasm -wreloc-abs -f elf64 -g $< -o $@
 $(BIN): $(OBJ) $(BIN_OBJ)
 	mold -run ld -o $(BIN) $(OBJ)
--- a/lang/src/lib.asm
+++ b/lang/src/lib.asm
@ -338,7 +338,7 @@ is_alpha:
    jb .false
    ; && c <= 'z') {
    cmp dil, 'z'
-    jbe .true
+    ja .false
    ;   return true;
 .true:
    mov rax, 1
@ -354,7 +354,7 @@ is_numeric:
    cmp dil, '0'
    jb .not_numeric
    cmp dil, '9'
-    jbe .is_numeric_ret
+    ja .not_numeric
 .is_numeric_ret:
    mov rax, 1
    ret
@ -370,9 +370,9 @@ is_id_continue:
    call is_numeric
    test rax, rax
    jne .is_id_continue_ret
-    cmp cl, '_'
+    cmp dil, '_'
    je .is_id_continue_ret
-    cmp cl, '-'
+    cmp dil, '-'
    je .is_id_continue_ret
    xor rax, rax
    ret
@ -385,7 +385,7 @@ is_id_start:
    call is_alpha
    test rax, rax
    jne .is_ret
-    cmp cl, '_'
+    cmp dil, '_'
    je .is_ret
    xor rax, rax
    ret
--- a/lang/src/tokeniser.asm
+++ b/lang/src/tokeniser.asm
@ -310,10 +310,18 @@ NUM_LEXEMES: dq 30
    TOKEN_NUMBER    equ 31
    LEX_NUMBER db "<number>"
    LEX_NUMBER_len equ $ - LEX_NUMBER
    TOKEN_STRING    equ 32
    LEX_STRING db "<string>"
    LEX_STRING_len equ $ - LEX_STRING
    TOKEN_COMMENT   equ 33
    LEX_COMMENT db "<comment>"
    LEX_COMMENT_len equ $ - LEX_COMMENT
 section .text
-;; rdi: length of matched lexeme
+;; rdi: length of previously matched lexeme
 ;; returns the length of the ident
 ;; fn is_ident(lexeme_len: usize) -> usize
 is_ident:
    push rbp
    mov rbp, rsp
@ -321,6 +329,7 @@ is_ident:
    push r13
    push r14
    push rdi
    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
@ -351,7 +360,7 @@ is_ident:
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
-    mov rax, 1
+    mov rax, r14
    jmp .epilogue
 .not_ident:
    xor rax, rax
@ -363,8 +372,158 @@ is_ident:
    pop rbp
    ret
-is_number:
+is_comment:
 ;; Recognises a line comment ("//" up to, but not including, the next newline)
 ;; starting at buffer[cursor].
 ;; In:    no register arguments; reads the globals cursor, buffer, buffer_len
 ;; Out:   rax = 0 if the text at the cursor does not start with "//",
 ;;        otherwise the number of bytes in the comment (newline excluded)
 ;; Effect: on a match, cursor is advanced by the returned length, so it ends
 ;;        up on the terminating newline (or at the end of the buffer)
 ;; Clobbers: dil, flags; r12-r14 and rbp are saved and restored
 ;; NOTE(review): the first byte is read without checking that any bytes
 ;; remain -- presumably the caller guarantees cursor < buffer_len; confirm.
 ;; NOTE(review): the length comparisons below use the signed `jge`.
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14
    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    ; r12 = &buffer[cursor], r13 = bytes remaining after the cursor
    add r12, rax
    sub r13, rax
    ; first byte must be '/'
    mov dil, [r12]
    cmp dil, '/'
    jne .not_comment
    ; r14 = index of the byte under inspection; a second '/' must exist
    mov r14, 1
    cmp r14, r13
    jge .not_comment
    mov dil, [r12 + r14]
    cmp dil, '/'
    jne .not_comment
 .loop:
    ; advance until a newline is found or the remaining bytes run out
    inc r14
    cmp r14, r13
    jge .comment
    mov dil, [r12 + r14]
    cmp dil, 10  ; newline
    jne .loop
 .comment:
    ; r14 = comment length; consume it and return it
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue
 .not_comment:
    ; no match: return 0 and leave cursor untouched
    xor rax, rax
 .epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret
 ;; Strings are sequences of characters enclosed in double quotes.
 ;; Strings may span multiple lines. A backslash makes the scanner skip the
 ;; byte that follows it, so an escaped quote (\") does not end the string;
 ;; escape sequences are not otherwise interpreted yet.
 ;;
 ;; fn is_string() -> usize
 ;; In:    no register arguments; reads the globals cursor, buffer, buffer_len
 ;; Out:   rax = 0 if there is no '"' at buffer[cursor] (cursor untouched),
 ;;        otherwise the number of bytes consumed (quotes included)
 ;; Effect: whenever a non-zero length is returned, cursor has been advanced
 ;;        by exactly that length. This also holds for an unterminated
 ;;        string, which consumes the rest of the buffer so that the caller
 ;;        always makes progress.
 ;; Clobbers: dil, flags; r12-r14 and rbp are saved and restored
 is_string:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14
    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    ; nothing left to scan: bail out before reading buffer[cursor]
    cmp rax, r13
    jae .not_string
    add r12, rax            ; r12 = &buffer[cursor]
    sub r13, rax            ; r13 = bytes remaining (> 0)
    mov dil, [r12]
    cmp dil, '"'
    jne .not_string
    mov r14, 1              ; r14 = bytes scanned so far (opening quote)
 .loop:
    ; lengths are unsigned, so use the unsigned condition codes
    cmp r14, r13
    jae .unterminated
    mov dil, [r12 + r14]
    cmp dil, '"'
    je .string
    cmp dil, 0x5c  ; backslash
    je .escape
    inc r14
    jmp .loop
 .escape:
    ; step over the backslash, then over the escaped byte if there is one
    inc r14
    cmp r14, r13
    jae .unterminated
    inc r14
    jmp .loop
 .string:
    inc r14                 ; include closing quote
    jmp .consume
 .unterminated:
 ;; TODO: report unterminated string error
    ; The scan ran off the end of the buffer; r14 covers everything from the
    ; opening quote to the end. Fall through and consume it: returning a
    ; non-zero length without moving the cursor would make the caller see
    ; the same opening quote again on its next call.
 .consume:
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue
 .not_string:
    xor rax, rax
 .epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret
 ;; Numbers are sequences of numeric characters, interspersed with underscores
 ;; The leading character must be numeric
 ;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
 ;;
 ;; fn is_number() -> usize
 ;; In:    no register arguments; reads the globals cursor, buffer, buffer_len
 ;; Out:   rax = 0 if the text at the cursor is not a number (cursor
 ;;        untouched), otherwise the length of the number in bytes
 ;; Effect: on a match, cursor is advanced by the returned length
 ;; Clobbers: dil, flags, plus whatever is_numeric / is_whitespace clobber;
 ;;        r12-r14 and rbp are saved and restored
 ;; A number ends at whitespace or at the end of the buffer. Any other byte
 ;; that is neither a digit nor '_' rejects the whole candidate (returns 0).
 ;; NOTE(review): this means input such as `0;` is not recognised as a
 ;; number -- confirm that this is intended.
 ;; Assumes is_numeric / is_whitespace take the byte in dil and return
 ;; non-zero in rax on a match (is_whitespace is defined elsewhere -- confirm).
 is_number:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14
    sub rsp, 8              ; four pushes leave rsp % 16 == 8; realign for the calls
    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    ; nothing left to scan: bail out before reading buffer[cursor]
    cmp rax, r13
    jae .not_number
    add r12, rax            ; r12 = &buffer[cursor]
    sub r13, rax            ; r13 = bytes remaining (> 0)
    mov dil, [r12]
    call is_numeric
    test rax, rax
    je .not_number
    mov r14, 1              ; r14 = bytes accepted so far
 .loop:
    ; lengths are unsigned, so use the unsigned condition codes
    cmp r14, r13
    jae .number
    mov dil, [r12 + r14]
    call is_whitespace
    test rax, rax
    jne .number
    ; rdi is caller-saved, so do not rely on dil surviving the call above
    mov dil, [r12 + r14]
    cmp dil, '_'
    je .loop_next
    call is_numeric
    test rax, rax
    je .not_number
 .loop_next:
    inc r14
    jmp .loop
 .number:
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue
 .not_number:
    xor rax, rax
 .epilogue:
    add rsp, 8              ; drop the alignment padding
    pop r14
    pop r13
    pop r12
    pop rbp
    ret
 skip_whitespaces:
@ -396,11 +555,22 @@ skip_whitespaces:
    pop rbp
    ret
 ;; rdi: pointer to out-struct
 ;; fn find_lexeme() -> (u8, *const u8, usize)
 find_lexeme:
    push rbp
    mov rbp, rsp
    push rdi
    ; skip whitespaces
    call skip_whitespaces
    ;; init out struct
    mov rdi, [rsp]
    mov rax, [rel buffer]
    add rax, [rel cursor]
    mov qword [rdi], 0
    mov [rdi + 8], rax
    mov qword [rdi + 16], 0
    ; check length
    mov rax, [rel cursor]
    mov rcx, [rel buffer_len]
@ -411,11 +581,29 @@ find_lexeme:
 .eof:
    ;   return TOKEN_EOF;
    mov rax, TOKEN_EOF
    pop rdi
    pop rbp
    ret
    ; }
 .start:
    push r12
    ; test special tokens:
    ; if buffer[cursor] == '"' {
    call is_string
    test rax, rax
    jne .is_string
    ; } else if buffer[cursor].is_numeric() {
    call is_number
    ;   return is_number();
    test rax, rax
    jne .is_number
    ; } else if buffer[cursor..][..2] == "//" {
    call is_comment
    ;   // skip to end of line
    test rax, rax
    jne .is_comment
    ; }
 .loop_init:
    mov r12, 1
    ; for 1..NUM_LEXEMES {
 .loop:
@ -444,26 +632,26 @@ find_lexeme:
    test rax, rax
    je .next
    ;       if is_ident() {
    mov rdi, rsi
    call is_ident
    test rax, rax
    ;         return TOKEN_IDENT;
    jne .is_ident
    ;       } else if is_number() {
    call is_number
    test rax, rax
    ;         return TOKEN_NUMBER;
    jne .is_number
    ;       } else {
    mov rdi, [rsp + 8]
    mov rax, [rel cursor]
    ;         cursor += len;
-    lea rdi, [rel LEXEME_LENS]
+    lea rsi, [rel LEXEME_LENS]
-    mov rdi, [rdi + r12*8]
+    mov rsi, [rsi + r12*8]
-    add rax, rdi
+    add rax, rsi
    mov [rel cursor], rax
    ;         return TOKENS[i];
    lea rax, [rel TOKENS]
    mov al, [rax + r12]
    and rax, 0xFF
    mov rdi, [rsp + 8]
    mov [rdi], al
    mov [rdi + 16], rsi
    jmp .epilogue
    ;       }
 .next:
@ -473,26 +661,44 @@ find_lexeme:
    ; }
 .not_found:
    ; if is_ident() {
    xor rdi, rdi
    call is_ident
    test rax, rax
    ;   return TOKEN_IDENT;
    jne .is_ident
    ; } else if is_number() {
    call is_number
    test rax, rax
    ;   return TOKEN_NUMBER;
    jne .is_number
    ; } else {
    ;   return TOKEN_EOF;
-    mov rax, TOKEN_EOF
+    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_EOF
    ; }
 .epilogue:
    pop r12
    pop rdi
    pop rbp
    mov rax, rdi
    ret
 .is_ident:
-    mov rax, TOKEN_IDENT
+    ; rax = len
    ; out.0 = TOKEN_IDENT
    ; out.1 = buffer.add(cursor - len)
    ; out.2 = len
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_IDENT
    mov [rdi + 16], rax
    jmp .epilogue
 .is_number:
-    mov rax, TOKEN_NUMBER
+    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_NUMBER
    mov [rdi + 16], rax
    jmp .epilogue
 .is_string:
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_STRING
    mov [rdi + 16], rax
    jmp .epilogue
 .is_comment:
    mov rdi, [rsp + 8]
    mov qword [rdi], TOKEN_COMMENT
    mov [rdi + 16], rax
    jmp .epilogue
--- a/lang/tests/tokens.rs
+++ b/lang/tests/tokens.rs
@ -3,21 +3,24 @@ extern "C" fn panic() -> ! {
    panic!("Called panic from external code.");
 }
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug)]
-struct Lexeme(u8);
+struct Lexeme(u8, &'static str);
 impl PartialEq for Lexeme {
    fn eq(&self, other: &Self) -> bool {
        match self.0 {
            // Identifiers and numbers compare both token and lexeme
            30 | 31 => self.0 == other.0 && self.1 == other.1,
            _ => self.0 == other.0,
        }
    }
 }
 impl Eq for Lexeme {}
 impl Lexeme {
    fn lex(&self) -> &'static str {
-        // SAFETY: lens contains the correct length for each lexeme, and lexemes
+        self.1
        // contains pointers to valid 'static UTF-8 data.
        unsafe {
            core::str::from_utf8_unchecked(
                core::slice::from_raw_parts(
                    (&raw const LEXEMES).add((self.0) as usize).read(),
                    (&raw const LEXEME_LENS).add((self.0) as usize).read(),
                )
            )
        }
    }
 }
@ -25,10 +28,19 @@ trait AsLexeme {
    fn as_lexeme(self) -> Option<Lexeme>;
 }
-impl AsLexeme for u8 {
+#[repr(C)]
 struct LexemeRaw {
    token: u8,
    lexeme: *const u8,
    len: usize,
 }
 impl AsLexeme for LexemeRaw {
    fn as_lexeme(self) -> Option<Lexeme> {
-        match self {
+        let Self { token, lexeme, len } = self;
-            1.. => Some(Lexeme(self)),
+        let slice =
            unsafe { core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len)) };
        match token {
            1.. => Some(Lexeme(token, slice)),
            _ => None,
        }
    }
@ -41,7 +53,8 @@ unsafe extern "C" {
    unsafe fn is_ident(len: usize) -> bool;
    unsafe fn is_number(len: usize) -> bool;
    unsafe fn skip_whitespace() -> ();
-    unsafe fn find_lexeme() -> u8;
+
    unsafe fn find_lexeme() -> LexemeRaw;
    static mut LEXEMES: *const u8;
    static mut LEXEME_LENS: usize;
@ -79,37 +92,137 @@ fn main() {
        tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
        eprintln!("ok.");
-        assert_eq!(&collect_tokens()[..], &[
+        assert_eq!(
-            Lexeme(4),
+            &collect_tokens()[..],
-            Lexeme(1),
+            &[
-            Lexeme(2),
+                Lexeme(4, ""),
-            Lexeme(3),
+                Lexeme(1, ""),
-            Lexeme(4),
+                Lexeme(2, ""),
-            Lexeme(8),
+                Lexeme(3, ""),
-            Lexeme(13),
+                Lexeme(4, ""),
-            Lexeme(11),
+                Lexeme(8, ""),
-            Lexeme(10),
+                Lexeme(13, ""),
-            Lexeme(9),
+                Lexeme(11, ""),
-            Lexeme(5),
+                Lexeme(10, ""),
-        ][..]);
+                Lexeme(9, ""),
                Lexeme(5, ""),
            ][..]
        );
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
        eprintln!("ok.");
-        assert_eq!(&collect_tokens()[..], &[
+        assert_eq!(
-            Lexeme(19),
+            &collect_tokens()[..],
-            Lexeme(18),
+            &[
-            Lexeme(28),
+                Lexeme(19, ""),
-            Lexeme(29),
+                Lexeme(18, ""),
-            Lexeme(21),
+                Lexeme(28, ""),
-            Lexeme(20),
+                Lexeme(29, ""),
-            Lexeme(24),
+                Lexeme(21, ""),
-            Lexeme(12),
+                Lexeme(20, ""),
-            Lexeme(23),
+                Lexeme(24, ""),
-            Lexeme(22),
+                Lexeme(12, ""),
-            Lexeme(15),
+                Lexeme(23, ""),
-        ][..]);
+                Lexeme(22, ""),
                Lexeme(15, ""),
            ][..]
        );
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
        eprintln!("ok.");
        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(30, "this-is-an-ident"),
                Lexeme(30, "another_ident123"),
                Lexeme(30, "_underscore_test"),
                Lexeme(30, "mixedCASEIdent"),
                Lexeme(30, "number12345"),
                Lexeme(30, "____"),
                Lexeme(30, "_"),
                Lexeme(17, ""),
                Lexeme(30, "leading-minus"),
                Lexeme(30, "trailing-minus-"),
            ]
        );
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/function.l".as_ptr());
        eprintln!("ok.");
        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(4, ""),
                Lexeme(30, "my-function"),
                Lexeme(19, ""),
                Lexeme(18, ""),
                Lexeme(12, ""),
                Lexeme(11, ""),
                Lexeme(21, ""),
                Lexeme(5, ""),
                Lexeme(10, ""),
                Lexeme(23, ""),
                Lexeme(20, ""),
            ]
        );
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/comment.l".as_ptr());
        eprintln!("ok.");
        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(33, ""),
                Lexeme(4, ""),
                Lexeme(30, "my-function"),
                Lexeme(19, ""),
                Lexeme(18, ""),
                Lexeme(12, ""),
                Lexeme(11, ""),
                Lexeme(21, ""),
                Lexeme(33, ""),
                Lexeme(5, ""),
                Lexeme(10, ""),
                Lexeme(23, ""),
                Lexeme(20, ""),
            ]
        );
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/number.l".as_ptr());
        eprintln!("ok.");
        assert_eq!(
            &collect_tokens()[..],
            &[
                Lexeme(31, "1234"),
                Lexeme(31, "123_345_"),
                Lexeme(31, "1234____56"),
                Lexeme(31, "1"),
                Lexeme(31, "0"),
            ]
        );
        eprint!("Initializing tokeniser.. ");
        tokeniser_init(c"tests/tokens/strings.l".as_ptr());
        eprintln!("ok.");
        assert_eq!(
            &collect_tokens()[..],
            &[[
                Lexeme(32, "\"this is a string\""),
                Lexeme(32, "\"another\nstring\nspanning multiple\n   lines\""),
                Lexeme(32, "\"string with a \\\"quoted\\\" word\""),
                Lexeme(32, "\"a\""),
                Lexeme(32, "\"\"")
            ],]
        );
        eprintln!("Finished tokenising.");
    }
--- a/lang/tests/tokens/comment.l
+++ b/lang/tests/tokens/comment.l
@ -0,0 +1,5 @@
 // This is a comment line
 fn my-function() -> bool {
  // This function always returns false
  return false;
 }
--- a/lang/tests/tokens/identifier.l
+++ b/lang/tests/tokens/identifier.l
@ -5,3 +5,5 @@ mixedCASEIdent
 number12345
 ____
 _
 -leading-minus
 trailing-minus-
--- a/lang/tests/tokens/number.l
+++ b/lang/tests/tokens/number.l
@ -0,0 +1,5 @@
 1234
 123_345_
 1234____56
 1
 0
--- a/lang/tests/tokens/strings.l
+++ b/lang/tests/tokens/strings.l
@ -0,0 +1,8 @@
 "this is a string"
 "another
 string
 spanning multiple
   lines"
 "string with a \"quoted\" word"
 "a"
 ""
Author	SHA1	Message	Date
janis	5f63d4303e	formatting, strings, comments	2025-10-28 18:16:39 +01:00
janis	43a06ad55c	test numbers	2025-10-28 17:16:28 +01:00
janis	44ee13246e	is_number	2025-10-28 17:07:05 +01:00
janis	82104cfe0d	edge case identifiers, simple function parsing	2025-10-28 16:52:17 +01:00
janis	26c37e9dd3	test identifiers	2025-10-28 16:48:13 +01:00
janis	e8f6cfb44a	fall-through error in is_numeric, is_id_* wrong register	2025-10-28 16:47:54 +01:00
janis	1886231dcd	return lexeme str slice from find_lexeme function	2025-10-28 16:30:15 +01:00
janis	b495114937	fall-through error fix	2025-10-28 16:29:58 +01:00