Compare commits

..

8 commits

8 changed files with 410 additions and 67 deletions

View file

@ -25,6 +25,10 @@ test: test-bins
"$$b" || exit $$?; \ "$$b" || exit $$?; \
done done
# Format the Rust test sources in place with rustfmt.
# `fmt` names a command rather than a file it produces, so it is declared
# phony: a stray file called `fmt` must not make the target look up to date.
.PHONY: fmt
fmt: $(wildcard tests/*.rs)
	@echo "Formatting test source files..."
	rustfmt --edition 2024 $^
# pattern rule: compile each .rs into a binary with the same base name # pattern rule: compile each .rs into a binary with the same base name
$(TARGET_DIR)/tests/%: tests/%.rs | $(OBJ) $(TARGET_DIR)/tests $(TARGET_DIR)/tests/%: tests/%.rs | $(OBJ) $(TARGET_DIR)/tests
@echo "[$(RUSTC)] $< -> $@" @echo "[$(RUSTC)] $< -> $@"
@ -37,7 +41,7 @@ $(TARGET_DIR)/tests: $(TARGET_DIR)
mkdir -p $(TARGET_DIR)/tests mkdir -p $(TARGET_DIR)/tests
$(TARGET_DIR)/%.o: src/%.asm | $(TARGET_DIR) $(TARGET_DIR)/%.o: src/%.asm | $(TARGET_DIR)
nasm -f elf64 -g $< -o $@ nasm -wreloc-abs -f elf64 -g $< -o $@
$(BIN): $(OBJ) $(BIN_OBJ) $(BIN): $(OBJ) $(BIN_OBJ)
mold -run ld -o $(BIN) $(OBJ) mold -run ld -o $(BIN) $(OBJ)

View file

@ -338,7 +338,7 @@ is_alpha:
jb .false jb .false
; && c <= 'z') { ; && c <= 'z') {
cmp dil, 'z' cmp dil, 'z'
jbe .true ja .false
; return true; ; return true;
.true: .true:
mov rax, 1 mov rax, 1
@ -354,7 +354,7 @@ is_numeric:
cmp dil, '0' cmp dil, '0'
jb .not_numeric jb .not_numeric
cmp dil, '9' cmp dil, '9'
jbe .is_numeric_ret ja .not_numeric
.is_numeric_ret: .is_numeric_ret:
mov rax, 1 mov rax, 1
ret ret
@ -370,9 +370,9 @@ is_id_continue:
call is_numeric call is_numeric
test rax, rax test rax, rax
jne .is_id_continue_ret jne .is_id_continue_ret
cmp cl, '_' cmp dil, '_'
je .is_id_continue_ret je .is_id_continue_ret
cmp cl, '-' cmp dil, '-'
je .is_id_continue_ret je .is_id_continue_ret
xor rax, rax xor rax, rax
ret ret
@ -385,7 +385,7 @@ is_id_start:
call is_alpha call is_alpha
test rax, rax test rax, rax
jne .is_ret jne .is_ret
cmp cl, '_' cmp dil, '_'
je .is_ret je .is_ret
xor rax, rax xor rax, rax
ret ret

View file

@ -310,10 +310,18 @@ NUM_LEXEMES: dq 30
TOKEN_NUMBER equ 31 TOKEN_NUMBER equ 31
LEX_NUMBER db "<number>" LEX_NUMBER db "<number>"
LEX_NUMBER_len equ $ - LEX_NUMBER LEX_NUMBER_len equ $ - LEX_NUMBER
TOKEN_STRING equ 32
LEX_STRING db "<string>"
LEX_STRING_len equ $ - LEX_STRING
TOKEN_COMMENT equ 33
LEX_COMMENT db "<comment>"
LEX_COMMENT_len equ $ - LEX_COMMENT
section .text section .text
;; rdi: length of matched lexeme ;; rdi: length of previously matched lexeme
;; returns the length of the ident
;; fn is_ident(lexeme_len: usize) -> usize
is_ident: is_ident:
push rbp push rbp
mov rbp, rsp mov rbp, rsp
@ -321,6 +329,7 @@ is_ident:
push r13 push r13
push r14 push r14
push rdi push rdi
mov rax, [rel cursor] mov rax, [rel cursor]
mov r12, [rel buffer] mov r12, [rel buffer]
mov r13, [rel buffer_len] mov r13, [rel buffer_len]
@ -351,7 +360,7 @@ is_ident:
mov rax, [rel cursor] mov rax, [rel cursor]
add rax, r14 add rax, r14
mov [rel cursor], rax mov [rel cursor], rax
mov rax, 1 mov rax, r14
jmp .epilogue jmp .epilogue
.not_ident: .not_ident:
xor rax, rax xor rax, rax
@ -363,8 +372,158 @@ is_ident:
pop rbp pop rbp
ret ret
;; Line comments start with "//" and run up to, but not including, the next
;; newline (or the end of the buffer).
;; On a match the global cursor is advanced past the comment text.
;; returns the length of the comment in rax, or 0 if there is no comment
;; at the cursor
;; fn is_comment() -> usize
is_comment:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    ; r12 = &buffer[cursor], r13 = number of bytes left after the cursor
    mov r12, [rel buffer]
    add r12, [rel cursor]
    mov r13, [rel buffer_len]
    sub r13, [rel cursor]

    ; assume "no comment" until both slashes have been seen
    xor eax, eax

    ; first '/'
    mov dil, [r12]
    cmp dil, '/'
    jne .done
    ; second '/', only if a second byte exists
    mov r14d, 1
    cmp r14, r13
    jge .done
    mov dil, [r12 + 1]
    cmp dil, '/'
    jne .done

    ; r14 walks forward from index 2 until a newline or the end of input
.scan:
    inc r14
    cmp r14, r13
    jge .found
    mov dil, [r12 + r14]
    cmp dil, 10                 ; newline
    jne .scan

.found:
    ; cursor += len; return len
    add [rel cursor], r14
    mov rax, r14

.done:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret
;; Strings are sequences of characters enclosed in double quotes
;; Strings span multiple lines, and may in the future contain escape sequences
;;
;; A backslash skips the character that follows it, so an escaped quote does
;; not terminate the string.
;; returns in rax the number of bytes consumed (including both quotes), or 0
;; if the byte at the cursor is not a double quote or no input is left.
;; Whenever a non-zero length is returned the global cursor has been advanced
;; by exactly that length. This also holds for an unterminated string, which
;; consumes the rest of the buffer, so the caller always makes progress.
;; fn is_string() -> usize
is_string:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    ; nothing left to read: do not touch buffer[cursor]
    cmp rax, r13
    jae .not_string
    add r12, rax                ; r12 = &buffer[cursor]
    sub r13, rax                ; r13 = bytes remaining (> 0)

    mov dil, [r12]
    cmp dil, '"'
    jne .not_string

    mov r14, 1                  ; r14 = bytes consumed so far (opening quote)
.loop:
    cmp r14, r13                ; lengths are unsigned
    jae .unterminated
    mov dil, [r12 + r14]
    cmp dil, '"'
    je .string
    cmp dil, 0x5c               ; backslash
    je .escape
    inc r14
    jmp .loop
.escape:
    inc r14                     ; skip the backslash
    cmp r14, r13
    jae .unterminated
    inc r14                     ; skip the escaped character
    jmp .loop
.string:
    inc r14                     ; include closing quote
.consume:
    ; cursor += len; return len
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue
.unterminated:
    ;; TODO: report unterminated string error
    ; r14 == r13 here: consume everything up to the end of the buffer so the
    ; returned length and the cursor stay in sync.
    jmp .consume
.not_string:
    xor rax, rax
.epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret
;; Numbers are sequences of numeric characters, interspersed with underscores
;; The leading character must be numeric
;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
;;
;; The number ends at the first byte that is neither numeric nor '_' (or at
;; the end of the buffer); that byte is left for the next token, so input
;; such as "1;" or "1)" yields the number "1".
;; returns in rax the length of the number, or 0 if the byte at the cursor is
;; not numeric or no input is left. On a match the global cursor is advanced
;; by the returned length.
;; fn is_number() -> usize
is_number:
    push rbp
    mov rbp, rsp
    push r12
    push r13
    push r14

    mov rax, [rel cursor]
    mov r12, [rel buffer]
    mov r13, [rel buffer_len]
    ; nothing left to read: do not touch buffer[cursor]
    cmp rax, r13
    jae .not_number
    add r12, rax                ; r12 = &buffer[cursor]
    sub r13, rax                ; r13 = bytes remaining (> 0)

    ; the leading character must be numeric
    mov dil, [r12]
    call is_numeric
    test rax, rax
    je .not_number

    mov r14, 1                  ; r14 = bytes accepted so far
.loop:
    cmp r14, r13                ; lengths are unsigned
    jae .number
    ; dil is reloaded before every helper call, so nothing here relies on
    ; the helper preserving rdi
    mov dil, [r12 + r14]
    cmp dil, '_'
    je .loop_next
    call is_numeric
    test rax, rax
    je .number                  ; first non-numeric byte terminates the number
.loop_next:
    inc r14
    jmp .loop
.number:
    ; cursor += len; return len
    mov rax, [rel cursor]
    add rax, r14
    mov [rel cursor], rax
    mov rax, r14
    jmp .epilogue
.not_number:
    xor rax, rax
.epilogue:
    pop r14
    pop r13
    pop r12
    pop rbp
    ret
skip_whitespaces: skip_whitespaces:
@ -396,11 +555,22 @@ skip_whitespaces:
pop rbp pop rbp
ret ret
;; rdi: pointer to out-struct
;; fn find_lexeme() -> (u8, *const u8, usize)
find_lexeme: find_lexeme:
push rbp push rbp
mov rbp, rsp mov rbp, rsp
push rdi
; skip whitespaces ; skip whitespaces
call skip_whitespaces call skip_whitespaces
;; init out struct
mov rdi, [rsp]
mov rax, [rel buffer]
add rax, [rel cursor]
mov qword [rdi], 0
mov [rdi + 8], rax
mov qword [rdi + 16], 0
; check length ; check length
mov rax, [rel cursor] mov rax, [rel cursor]
mov rcx, [rel buffer_len] mov rcx, [rel buffer_len]
@ -411,11 +581,29 @@ find_lexeme:
.eof: .eof:
; return TOKEN_EOF; ; return TOKEN_EOF;
mov rax, TOKEN_EOF mov rax, TOKEN_EOF
pop rdi
pop rbp pop rbp
ret ret
; } ; }
.start: .start:
push r12 push r12
; test special tokens:
; if buffer[cursor] == '"' {
call is_string
test rax, rax
jne .is_string
; } else if buffer[cursor].is_numeric() {
call is_number
; return is_number();
test rax, rax
jne .is_number
; } else if buffer[cursor..][..2] == "//" {
call is_comment
; // skip to end of line
test rax, rax
jne .is_comment
; }
.loop_init:
mov r12, 1 mov r12, 1
; for 1..NUM_LEXEMES { ; for 1..NUM_LEXEMES {
.loop: .loop:
@ -444,26 +632,26 @@ find_lexeme:
test rax, rax test rax, rax
je .next je .next
; if is_ident() { ; if is_ident() {
mov rdi, rsi
call is_ident call is_ident
test rax, rax test rax, rax
; return TOKEN_IDENT; ; return TOKEN_IDENT;
jne .is_ident jne .is_ident
; } else if is_number() {
call is_number
test rax, rax
; return TOKEN_NUMBER;
jne .is_number
; } else { ; } else {
mov rdi, [rsp + 8]
mov rax, [rel cursor] mov rax, [rel cursor]
; cursor += len; ; cursor += len;
lea rdi, [rel LEXEME_LENS] lea rsi, [rel LEXEME_LENS]
mov rdi, [rdi + r12*8] mov rsi, [rsi + r12*8]
add rax, rdi add rax, rsi
mov [rel cursor], rax mov [rel cursor], rax
; return TOKENS[i]; ; return TOKENS[i];
lea rax, [rel TOKENS] lea rax, [rel TOKENS]
mov al, [rax + r12] mov al, [rax + r12]
and rax, 0xFF and rax, 0xFF
mov rdi, [rsp + 8]
mov [rdi], al
mov [rdi + 16], rsi
jmp .epilogue jmp .epilogue
; } ; }
.next: .next:
@ -473,26 +661,44 @@ find_lexeme:
; } ; }
.not_found: .not_found:
; if is_ident() { ; if is_ident() {
xor rdi, rdi
call is_ident call is_ident
test rax, rax test rax, rax
; return TOKEN_IDENT; ; return TOKEN_IDENT;
jne .is_ident jne .is_ident
; } else if is_number() {
call is_number
test rax, rax
; return TOKEN_NUMBER;
jne .is_number
; } else { ; } else {
; return TOKEN_EOF; ; return TOKEN_EOF;
mov rax, TOKEN_EOF mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_EOF
; } ; }
.epilogue: .epilogue:
pop r12 pop r12
pop rdi
pop rbp pop rbp
mov rax, rdi
ret ret
.is_ident: .is_ident:
mov rax, TOKEN_IDENT ; rax = len
; out.0 = TOKEN_IDENT
; out.1 = buffer.add(cursor - len)
; out.2 = len
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_IDENT
mov [rdi + 16], rax
jmp .epilogue jmp .epilogue
.is_number: .is_number:
mov rax, TOKEN_NUMBER mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_NUMBER
mov [rdi + 16], rax
jmp .epilogue
.is_string:
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_STRING
mov [rdi + 16], rax
jmp .epilogue
.is_comment:
mov rdi, [rsp + 8]
mov qword [rdi], TOKEN_COMMENT
mov [rdi + 16], rax
jmp .epilogue jmp .epilogue

View file

@ -3,21 +3,24 @@ extern "C" fn panic() -> ! {
panic!("Called panic from external code."); panic!("Called panic from external code.");
} }
/// A token id together with the source text it was lexed from.
#[derive(Debug)]
struct Lexeme(u8, &'static str);

impl PartialEq for Lexeme {
    /// Two lexemes are equal when their token ids match. For identifiers (30)
    /// and numbers (31) the lexeme text has to match as well; for every other
    /// token the text is ignored.
    fn eq(&self, other: &Self) -> bool {
        let Lexeme(token, text) = self;
        let Lexeme(other_token, other_text) = other;

        if token != other_token {
            return false;
        }
        // Only identifier and number tokens carry a payload worth comparing.
        !matches!(*token, 30 | 31) || text == other_text
    }
}

impl Eq for Lexeme {}

impl Lexeme {
    /// Returns the source text stored alongside the token id.
    fn lex(&self) -> &'static str {
        let Lexeme(_, text) = self;
        text
    }
}
@ -25,10 +28,19 @@ trait AsLexeme {
fn as_lexeme(self) -> Option<Lexeme>; fn as_lexeme(self) -> Option<Lexeme>;
} }
impl AsLexeme for u8 { #[repr(C)]
/// Raw result record of the assembly routine `find_lexeme`, which the extern
/// declaration in this file gives the return type `LexemeRaw`.
/// The assembly fills the record through the out-struct pointer it receives
/// in `rdi`, using offsets 0, 8 and 16 for the three fields below.
struct LexemeRaw {
/// Token id, stored at offset 0. The assembly initialises it to 0, and
/// `as_lexeme` maps a token of 0 to `None`.
token: u8,
/// Start of the lexeme, stored at offset 8: `buffer + cursor`, taken after
/// leading whitespace has been skipped.
lexeme: *const u8,
/// Length of the lexeme in bytes, stored at offset 16. It is initialised to 0
/// and overwritten with the matched length when a token is found.
len: usize,
}
impl AsLexeme for LexemeRaw {
fn as_lexeme(self) -> Option<Lexeme> { fn as_lexeme(self) -> Option<Lexeme> {
match self { let Self { token, lexeme, len } = self;
1.. => Some(Lexeme(self)), let slice =
unsafe { core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len)) };
match token {
1.. => Some(Lexeme(token, slice)),
_ => None, _ => None,
} }
} }
@ -41,7 +53,8 @@ unsafe extern "C" {
unsafe fn is_ident(len: usize) -> bool; unsafe fn is_ident(len: usize) -> bool;
unsafe fn is_number(len: usize) -> bool; unsafe fn is_number(len: usize) -> bool;
unsafe fn skip_whitespace() -> (); unsafe fn skip_whitespace() -> ();
unsafe fn find_lexeme() -> u8;
unsafe fn find_lexeme() -> LexemeRaw;
static mut LEXEMES: *const u8; static mut LEXEMES: *const u8;
static mut LEXEME_LENS: usize; static mut LEXEME_LENS: usize;
@ -79,37 +92,137 @@ fn main() {
tokeniser_init(c"tests/tokens/keywords.l".as_ptr()); tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
eprintln!("ok."); eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[ assert_eq!(
Lexeme(4), &collect_tokens()[..],
Lexeme(1), &[
Lexeme(2), Lexeme(4, ""),
Lexeme(3), Lexeme(1, ""),
Lexeme(4), Lexeme(2, ""),
Lexeme(8), Lexeme(3, ""),
Lexeme(13), Lexeme(4, ""),
Lexeme(11), Lexeme(8, ""),
Lexeme(10), Lexeme(13, ""),
Lexeme(9), Lexeme(11, ""),
Lexeme(5), Lexeme(10, ""),
][..]); Lexeme(9, ""),
Lexeme(5, ""),
][..]
);
eprint!("Initializing tokeniser.. "); eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/delimiters.l".as_ptr()); tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
eprintln!("ok."); eprintln!("ok.");
assert_eq!(&collect_tokens()[..], &[ assert_eq!(
Lexeme(19), &collect_tokens()[..],
Lexeme(18), &[
Lexeme(28), Lexeme(19, ""),
Lexeme(29), Lexeme(18, ""),
Lexeme(21), Lexeme(28, ""),
Lexeme(20), Lexeme(29, ""),
Lexeme(24), Lexeme(21, ""),
Lexeme(12), Lexeme(20, ""),
Lexeme(23), Lexeme(24, ""),
Lexeme(22), Lexeme(12, ""),
Lexeme(15), Lexeme(23, ""),
][..]); Lexeme(22, ""),
Lexeme(15, ""),
][..]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(30, "this-is-an-ident"),
Lexeme(30, "another_ident123"),
Lexeme(30, "_underscore_test"),
Lexeme(30, "mixedCASEIdent"),
Lexeme(30, "number12345"),
Lexeme(30, "____"),
Lexeme(30, "_"),
Lexeme(17, ""),
Lexeme(30, "leading-minus"),
Lexeme(30, "trailing-minus-"),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/comment.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(33, ""),
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(33, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
Lexeme(20, ""),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/number.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(31, "1234"),
Lexeme(31, "123_345_"),
Lexeme(31, "1234____56"),
Lexeme(31, "1"),
Lexeme(31, "0"),
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/strings.l".as_ptr());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[[
Lexeme(32, "\"this is a string\""),
Lexeme(32, "\"another\nstring\nspanning multiple\n lines\""),
Lexeme(32, "\"string with a \\\"quoted\\\" word\""),
Lexeme(32, "\"a\""),
Lexeme(32, "\"\"")
],]
);
eprintln!("Finished tokenising."); eprintln!("Finished tokenising.");
} }

View file

@ -0,0 +1,5 @@
// This is a comment line
fn my-function() -> bool {
// This function always returns false
return false;
}

View file

@ -5,3 +5,5 @@ mixedCASEIdent
number12345 number12345
____ ____
_ _
-leading-minus
trailing-minus-

View file

@ -0,0 +1,5 @@
1234
123_345_
1234____56
1
0

View file

@ -0,0 +1,8 @@
"this is a string"
"another
string
spanning multiple
lines"
"string with a \"quoted\" word"
"a"
""