Compare commits

...

8 commits

8 changed files with 410 additions and 67 deletions

View file

@ -25,6 +25,10 @@ test: test-bins
"$$b" || exit $$?; \
done
fmt: $(wildcard tests/*.rs)
@echo "Formatting test source files..."
rustfmt --edition 2024 $^
# pattern rule: compile each .rs into a binary with the same base name
$(TARGET_DIR)/tests/%: tests/%.rs | $(OBJ) $(TARGET_DIR)/tests
@echo "[$(RUSTC)] $< -> $@"
@ -37,7 +41,7 @@ $(TARGET_DIR)/tests: $(TARGET_DIR)
mkdir -p $(TARGET_DIR)/tests
$(TARGET_DIR)/%.o: src/%.asm | $(TARGET_DIR)
nasm -f elf64 -g $< -o $@
nasm -wreloc-abs -f elf64 -g $< -o $@
$(BIN): $(OBJ) $(BIN_OBJ)
mold -run ld -o $(BIN) $(OBJ)

View file

@ -338,7 +338,7 @@ is_alpha:
jb .false
; && c <= 'z') {
cmp dil, 'z'
jbe .true
ja .false
; return true;
.true:
mov rax, 1
@ -354,7 +354,7 @@ is_numeric:
cmp dil, '0'
jb .not_numeric
cmp dil, '9'
jbe .is_numeric_ret
ja .not_numeric
.is_numeric_ret:
mov rax, 1
ret
@ -370,9 +370,9 @@ is_id_continue:
call is_numeric
test rax, rax
jne .is_id_continue_ret
cmp cl, '_'
cmp dil, '_'
je .is_id_continue_ret
cmp cl, '-'
cmp dil, '-'
je .is_id_continue_ret
xor rax, rax
ret
@ -385,7 +385,7 @@ is_id_start:
call is_alpha
test rax, rax
jne .is_ret
cmp cl, '_'
cmp dil, '_'
je .is_ret
xor rax, rax
ret

View file

@ -310,10 +310,18 @@ NUM_LEXEMES: dq 30
TOKEN_NUMBER equ 31
LEX_NUMBER db "<number>"
LEX_NUMBER_len equ $ - LEX_NUMBER
TOKEN_STRING equ 32
LEX_STRING db "<string>"
LEX_STRING_len equ $ - LEX_STRING
TOKEN_COMMENT equ 33
LEX_COMMENT db "<comment>"
LEX_COMMENT_len equ $ - LEX_COMMENT
section .text
;; rdi: length of matched lexeme
;; rdi: length of previously matched lexeme
;; returns the length of the ident
;; fn is_ident(lexeme_len: usize) -> usize
is_ident:
push rbp
mov rbp, rsp
@ -321,6 +329,7 @@ is_ident:
push r13
push r14
push rdi
mov rax, [rel cursor]
mov r12, [rel buffer]
mov r13, [rel buffer_len]
@ -351,7 +360,7 @@ is_ident:
mov rax, [rel cursor]
add rax, r14
mov [rel cursor], rax
mov rax, 1
mov rax, r14
jmp .epilogue
.not_ident:
xor rax, rax
@ -363,8 +372,158 @@ is_ident:
pop rbp
ret
is_number:
;; Comments start with "//" and run to the end of the line (or end of buffer).
;; fn is_comment() -> usize
;; Out:   rax = length of the comment in bytes (newline excluded), and the
;;        global cursor is advanced past it; rax = 0 (cursor untouched) when
;;        no comment starts at cursor.
;; Clobb: rdi, flags
;; NOTE(review): assumes caller guaranteed cursor < buffer_len before the
;; first [r12] load — TODO confirm (find_lexeme checks length before .start).
is_comment:
        push rbp
        mov rbp, rsp
        push r12
        push r13
        push r14
        mov rax, [rel cursor]
        mov r12, [rel buffer]
        mov r13, [rel buffer_len]
        add r12, rax                    ; r12 = &buffer[cursor]
        sub r13, rax                    ; r13 = bytes remaining
        mov dil, [r12]
        cmp dil, '/'
        jne .not_comment
        mov r14, 1                      ; r14 = bytes matched so far
        cmp r14, r13
        jae .not_comment                ; unsigned compare: r14/r13 are lengths
        mov dil, [r12 + r14]
        cmp dil, '/'
        jne .not_comment
.loop:
        inc r14
        cmp r14, r13
        jae .comment                    ; end of buffer also ends the comment
        mov dil, [r12 + r14]
        cmp dil, 10 ; newline
        jne .loop
.comment:
        ; cursor += len; return len
        mov rax, [rel cursor]
        add rax, r14
        mov [rel cursor], rax
        mov rax, r14
        jmp .epilogue
.not_comment:
        xor rax, rax
.epilogue:
        pop r14
        pop r13
        pop r12
        pop rbp
        ret
;; Strings are sequences of characters enclosed in double quotes
;; Strings span multiple lines, and may in the future contain escape sequences
;; fn is_string() -> usize
;; Out:   rax = matched length in bytes (both quotes included), and the global
;;        cursor is advanced past the string; rax = 0 (cursor untouched) when
;;        no string starts at cursor.
;; Clobb: rdi, flags
;; NOTE(review): assumes caller guaranteed cursor < buffer_len before the
;; first [r12] load — TODO confirm (find_lexeme checks length before .start).
is_string:
        push rbp
        mov rbp, rsp
        push r12
        push r13
        push r14
        mov rax, [rel cursor]
        mov r12, [rel buffer]
        mov r13, [rel buffer_len]
        add r12, rax                    ; r12 = &buffer[cursor]
        sub r13, rax                    ; r13 = bytes remaining
        mov dil, [r12]
        cmp dil, '"'
        jne .not_string
        mov r14, 1                      ; r14 = bytes consumed (opening quote)
.loop:
        cmp r14, r13
        jae .unterminated               ; unsigned compare: r14/r13 are lengths
        mov dil, [r12 + r14]
        cmp dil, '"'
        je .string
        cmp dil, 0x5c ; backslash
        je .escape
        inc r14
        jmp .loop
.escape:
        inc r14                         ; skip the backslash
        cmp r14, r13
        jae .unterminated
        inc r14                         ; skip the escaped character
        jmp .loop
.string:
        mov rax, [rel cursor]
        inc r14 ; include closing quote
        add rax, r14
        mov [rel cursor], rax
        mov rax, r14
        jmp .epilogue
.unterminated:
        ;; TODO: report unterminated string error
        ;; Consume the scanned bytes: returning a non-zero length while
        ;; leaving cursor unchanged made find_lexeme emit the same
        ;; unterminated string forever (infinite loop in the caller).
        mov rax, [rel cursor]
        add rax, r14
        mov [rel cursor], rax
        mov rax, r14
        jmp .epilogue
.not_string:
        xor rax, rax
.epilogue:
        pop r14
        pop r13
        pop r12
        pop rbp
        ret
;; Numbers are sequences of numeric characters, interspersed with underscores
;; The leading character must be numeric
;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
;; fn is_number() -> usize
;; Out:   rax = matched length in bytes, and the global cursor is advanced
;;        past the number; rax = 0 (cursor untouched) when no number starts
;;        at cursor.
;; Clobb: rdi, flags
;; NOTE(review): relies on is_whitespace/is_numeric preserving dil across the
;; call (rdi is caller-saved in SysV) — TODO confirm the helpers do so.
;; NOTE(review): a non-digit, non-whitespace, non-underscore terminator
;; (e.g. "123)") rejects the WHOLE number rather than ending it — confirm
;; this is the intended behavior.
is_number:
        push rbp
        mov rbp, rsp
        push r12
        push r13
        push r14
        mov rax, [rel cursor]
        mov r12, [rel buffer]
        mov r13, [rel buffer_len]
        add r12, rax                    ; r12 = &buffer[cursor]
        sub r13, rax                    ; r13 = bytes remaining
        mov dil, [r12]
        call is_numeric                 ; leading char must be a digit
        test rax, rax
        je .not_number
        mov r14, 1                      ; r14 = bytes matched so far
.loop:
        cmp r14, r13
        jae .number                     ; unsigned compare: r14/r13 are lengths
        mov dil, [r12 + r14]
        call is_whitespace
        test rax, rax
        jne .number                     ; whitespace ends the number
        cmp dil, '_'
        je .loop_next                   ; underscores are allowed inside
        call is_numeric
        test rax, rax
        je .not_number
.loop_next:
        inc r14
        jmp .loop
.number:
        ; cursor += len; return len
        mov rax, [rel cursor]
        add rax, r14
        mov [rel cursor], rax
        mov rax, r14
        jmp .epilogue
.not_number:
        xor rax, rax
.epilogue:
        pop r14
        pop r13
        pop r12
        pop rbp
        ret
skip_whitespaces:
@ -396,11 +555,22 @@ skip_whitespaces:
pop rbp
ret
;; rdi: pointer to out-struct
;; fn find_lexeme() -> (u8, *const u8, usize)
find_lexeme:
push rbp
mov rbp, rsp
push rdi
; skip whitespaces
call skip_whitespaces
;; init out struct
mov rdi, [rsp]
mov rax, [rel buffer]
add rax, [rel cursor]
mov qword [rdi], 0
mov [rdi + 8], rax
mov qword [rdi + 16], 0
; check length
mov rax, [rel cursor]
mov rcx, [rel buffer_len]
@ -411,11 +581,29 @@ find_lexeme:
.eof:
; return TOKEN_EOF;
mov rax, TOKEN_EOF
pop rdi
pop rbp
ret
; }
.start:
push r12
; test special tokens:
; if buffer[cursor] == '"' {
call is_string
test rax, rax
jne .is_string
; } else if buffer[cursor].is_numeric() {
call is_number
; return is_number();
test rax, rax
jne .is_number
; } else if buffer[cursor..][..2] == "//" {
call is_comment
; // skip to end of line
test rax, rax
jne .is_comment
; }
.loop_init:
mov r12, 1
; for 1..NUM_LEXEMES {
.loop:
@ -444,26 +632,26 @@ find_lexeme:
test rax, rax
je .next
; if is_ident() {
mov rdi, rsi
call is_ident
test rax, rax
; return TOKEN_IDENT;
jne .is_ident
; } else if is_number() {
call is_number
test rax, rax
; return TOKEN_NUMBER;
jne .is_number
; } else {
mov rdi, [rsp + 8]
mov rax, [rel cursor]
; cursor += len;
lea rdi, [rel LEXEME_LENS]
mov rdi, [rdi + r12*8]
add rax, rdi
lea rsi, [rel LEXEME_LENS]
mov rsi, [rsi + r12*8]
add rax, rsi
mov [rel cursor], rax
; return TOKENS[i];
lea rax, [rel TOKENS]
mov al, [rax + r12]
and rax, 0xFF
mov rdi, [rsp + 8]
mov [rdi], al
mov [rdi + 16], rsi
jmp .epilogue
; }
.next:
@ -473,26 +661,44 @@ find_lexeme:
; }
.not_found:
; if is_ident() {
xor rdi, rdi
call is_ident
test rax, rax
; return TOKEN_IDENT;
jne .is_ident
; } else if is_number() {
call is_number
test rax, rax
; return TOKEN_NUMBER;
jne .is_number
; } else {
; return TOKEN_EOF;
mov rax, TOKEN_EOF
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_EOF
; }
.epilogue:
pop r12
pop rdi
pop rbp
mov rax, rdi
ret
.is_ident:
mov rax, TOKEN_IDENT
; rax = len
; out.0 = TOKEN_IDENT
; out.1 = buffer.add(cursor - len)
; out.2 = len
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_IDENT
mov [rdi + 16], rax
jmp .epilogue
.is_number:
mov rax, TOKEN_NUMBER
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_NUMBER
mov [rdi + 16], rax
jmp .epilogue
.is_string:
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_STRING
mov [rdi + 16], rax
jmp .epilogue
.is_comment:
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_COMMENT
mov [rdi + 16], rax
jmp .epilogue

View file

@ -3,21 +3,24 @@ extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Lexeme(u8);
#[derive(Debug)]
struct Lexeme(u8, &'static str);
impl PartialEq for Lexeme {
    /// Tokens that carry meaningful text — identifiers (30), numbers (31)
    /// and strings (32) — compare both the token kind and the lexeme text;
    /// fixed-spelling tokens compare the kind alone, so tests can write
    /// `Lexeme(4, "")` without repeating the keyword's spelling.
    fn eq(&self, other: &Self) -> bool {
        match self.0 {
            // Identifiers, numbers and strings compare both token and lexeme.
            // Strings (32) were previously excluded, which made the exact
            // string-content assertions in the test suite vacuous.
            30..=32 => self.0 == other.0 && self.1 == other.1,
            _ => self.0 == other.0,
        }
    }
}
impl Eq for Lexeme {}
impl Lexeme {
fn lex(&self) -> &'static str {
// SAFETY: lens contains the correct length for each lexeme, and lexemes
// contains pointers to valid 'static UTF-8 data.
unsafe {
core::str::from_utf8_unchecked(
core::slice::from_raw_parts(
(&raw const LEXEMES).add((self.0) as usize).read(),
(&raw const LEXEME_LENS).add((self.0) as usize).read(),
)
)
}
self.1
}
}
@ -25,10 +28,19 @@ trait AsLexeme {
fn as_lexeme(self) -> Option<Lexeme>;
}
impl AsLexeme for u8 {
/// Raw (token, pointer, length) triple returned by the assembly
/// `find_lexeme` routine via its out-struct.
/// `#[repr(C)]` is required: field order and layout must match the
/// qword offsets (+0 token, +8 lexeme ptr, +16 len) the assembly writes.
#[repr(C)]
struct LexemeRaw {
token: u8, // token id (0 = none/EOF per as_lexeme)
lexeme: *const u8, // start of the lexeme inside the tokeniser buffer
len: usize, // lexeme length in bytes
}
impl AsLexeme for LexemeRaw {
fn as_lexeme(self) -> Option<Lexeme> {
match self {
1.. => Some(Lexeme(self)),
let Self { token, lexeme, len } = self;
let slice =
unsafe { core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len)) };
match token {
1.. => Some(Lexeme(token, slice)),
_ => None,
}
}
@ -41,7 +53,8 @@ unsafe extern "C" {
unsafe fn is_ident(len: usize) -> bool;
unsafe fn is_number(len: usize) -> bool;
unsafe fn skip_whitespace() -> ();
unsafe fn find_lexeme() -> u8;
unsafe fn find_lexeme() -> LexemeRaw;
static mut LEXEMES: *const u8;
static mut LEXEME_LENS: usize;
@ -79,37 +92,137 @@ fn main() {
tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[
Lexeme(4),
Lexeme(1),
Lexeme(2),
Lexeme(3),
Lexeme(4),
Lexeme(8),
Lexeme(13),
Lexeme(11),
Lexeme(10),
Lexeme(9),
Lexeme(5),
][..]);
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(1, ""),
Lexeme(2, ""),
Lexeme(3, ""),
Lexeme(4, ""),
Lexeme(8, ""),
Lexeme(13, ""),
Lexeme(11, ""),
Lexeme(10, ""),
Lexeme(9, ""),
Lexeme(5, ""),
][..]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[
Lexeme(19),
Lexeme(18),
Lexeme(28),
Lexeme(29),
Lexeme(21),
Lexeme(20),
Lexeme(24),
Lexeme(12),
Lexeme(23),
Lexeme(22),
Lexeme(15),
][..]);
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(28, ""),
Lexeme(29, ""),
Lexeme(21, ""),
Lexeme(20, ""),
Lexeme(24, ""),
Lexeme(12, ""),
Lexeme(23, ""),
Lexeme(22, ""),
Lexeme(15, ""),
][..]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(30, "this-is-an-ident"),
Lexeme(30, "another_ident123"),
Lexeme(30, "_underscore_test"),
Lexeme(30, "mixedCASEIdent"),
Lexeme(30, "number12345"),
Lexeme(30, "____"),
Lexeme(30, "_"),
Lexeme(17, ""),
Lexeme(30, "leading-minus"),
Lexeme(30, "trailing-minus-"),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/comment.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(33, ""),
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(33, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/number.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(31, "1234"),
Lexeme(31, "123_345_"),
Lexeme(31, "1234____56"),
Lexeme(31, "1"),
Lexeme(31, "0"),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/strings.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[[
Lexeme(32, "\"this is a string\""),
Lexeme(32, "\"another\nstring\nspanning multiple\n lines\""),
Lexeme(32, "\"string with a \\\"quoted\\\" word\""),
Lexeme(32, "\"a\""),
Lexeme(32, "\"\"")
],]
);
eprintln!("Finished tokenising.");
}

View file

@ -0,0 +1,5 @@
// This is a comment line
fn my-function() -> bool {
// This function always returns false
return false;
}

View file

@ -5,3 +5,5 @@ mixedCASEIdent
number12345
____
_
-leading-minus
trailing-minus-

View file

@ -0,0 +1,5 @@
1234
123_345_
1234____56
1
0

View file

@ -0,0 +1,8 @@
"this is a string"
"another
string
spanning multiple
lines"
"string with a \"quoted\" word"
"a"
""