formatting, strings, comments
This commit is contained in:
parent
43a06ad55c
commit
5f63d4303e
|
|
@ -25,6 +25,10 @@ test: test-bins
|
|||
"$$b" || exit $$?; \
|
||||
done
|
||||
|
||||
fmt: $(wildcard tests/*.rs)
|
||||
@echo "Formatting test source files..."
|
||||
rustfmt --edition 2024 $^
|
||||
|
||||
# pattern rule: compile each .rs into a binary with the same base name
|
||||
$(TARGET_DIR)/tests/%: tests/%.rs | $(OBJ) $(TARGET_DIR)/tests
|
||||
@echo "[$(RUSTC)] $< -> $@"
|
||||
|
|
@ -37,7 +41,7 @@ $(TARGET_DIR)/tests: $(TARGET_DIR)
|
|||
mkdir -p $(TARGET_DIR)/tests
|
||||
|
||||
$(TARGET_DIR)/%.o: src/%.asm | $(TARGET_DIR)
|
||||
nasm -f elf64 -g $< -o $@
|
||||
nasm -wreloc-abs -f elf64 -g $< -o $@
|
||||
|
||||
$(BIN): $(OBJ) $(BIN_OBJ)
|
||||
mold -run ld -o $(BIN) $(OBJ)
|
||||
|
|
|
|||
|
|
@ -310,6 +310,12 @@ NUM_LEXEMES: dq 30
|
|||
TOKEN_NUMBER equ 31
|
||||
LEX_NUMBER db "<number>"
|
||||
LEX_NUMBER_len equ $ - LEX_NUMBER
|
||||
TOKEN_STRING equ 32
|
||||
LEX_STRING db "<string>"
|
||||
LEX_STRING_len equ $ - LEX_STRING
|
||||
TOKEN_COMMENT equ 33
|
||||
LEX_COMMENT db "<comment>"
|
||||
LEX_COMMENT_len equ $ - LEX_COMMENT
|
||||
|
||||
|
||||
section .text
|
||||
|
|
@ -366,6 +372,108 @@ is_ident:
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
;-----------------------------------------------------------------------
; size_t is_comment(void)
; Recognises a '//' line comment starting at buffer[cursor].
; In:    [rel cursor], [rel buffer], [rel buffer_len] (globals)
; Out:   rax = lexeme length (0 if not a comment)
; Side:  advances [rel cursor] past the comment body; the terminating
;        newline (byte 10) is NOT consumed
; Clobb: rdi (dil), flags
;-----------------------------------------------------------------------
is_comment:
        push    rbp
        mov     rbp, rsp
        push    r12
        push    r13
        push    r14

        mov     rax, [rel cursor]
        mov     r12, [rel buffer]
        mov     r13, [rel buffer_len]
        add     r12, rax                ; r12 = &buffer[cursor]
        sub     r13, rax                ; r13 = bytes remaining

        test    r13, r13                ; nothing left to read? then it
        jz      .not_comment            ; cannot be a comment (bounds fix)
        mov     dil, [r12]
        cmp     dil, '/'
        jne     .not_comment

        mov     r14, 1                  ; r14 = scan offset
        cmp     r14, r13
        jae     .not_comment            ; unsigned compare: lengths are never negative
        mov     dil, [r12 + r14]
        cmp     dil, '/'
        jne     .not_comment
.loop:                                  ; consume bytes until newline or end of input
        inc     r14
        cmp     r14, r13
        jae     .comment                ; ran off the buffer: comment ends at EOF
        mov     dil, [r12 + r14]
        cmp     dil, 10                 ; newline terminates (left unconsumed)
        jne     .loop
.comment:
        mov     rax, [rel cursor]
        add     rax, r14
        mov     [rel cursor], rax       ; cursor += length
        mov     rax, r14                ; return length
        jmp     .epilogue

.not_comment:
        xor     eax, eax                ; return 0 (zeroes all of rax)
.epilogue:
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        ret
|
||||
|
||||
;-----------------------------------------------------------------------
; size_t is_string(void)
; Strings are sequences of characters enclosed in double quotes.
; Strings may span multiple lines, and may in the future contain
; escape sequences; a backslash currently skips the following byte.
; In:    [rel cursor], [rel buffer], [rel buffer_len] (globals)
; Out:   rax = lexeme length including both quotes (0 if not a string)
; Side:  advances [rel cursor] past the closing quote on success
; Clobb: rdi (dil), flags
;-----------------------------------------------------------------------
is_string:
        push    rbp
        mov     rbp, rsp
        push    r12
        push    r13
        push    r14

        mov     rax, [rel cursor]
        mov     r12, [rel buffer]
        mov     r13, [rel buffer_len]
        add     r12, rax                ; r12 = &buffer[cursor]
        sub     r13, rax                ; r13 = bytes remaining

        test    r13, r13                ; empty input: cannot start a string
        jz      .not_string             ; (bounds fix: avoid OOB read below)
        mov     dil, [r12]
        cmp     dil, '"'
        jne     .not_string

        mov     r14, 1                  ; r14 = scan offset (past opening quote)
.loop:
        cmp     r14, r13
        jae     .unterminated           ; unsigned compare on lengths
        mov     dil, [r12 + r14]
        cmp     dil, '"'
        je      .string
        cmp     dil, 0x5c               ; backslash starts an escape sequence
        je      .escape
        inc     r14
        jmp     .loop
.escape:
        inc     r14                     ; step over the backslash
        cmp     r14, r13
        jae     .unterminated           ; backslash at EOF: string never closed
        inc     r14                     ; skip the escaped byte unexamined
        jmp     .loop
.string:
        mov     rax, [rel cursor]
        inc     r14                     ; include closing quote in the length
        add     rax, r14
        mov     [rel cursor], rax       ; cursor += length
        mov     rax, r14                ; return length
        jmp     .epilogue
.unterminated:
        ;; TODO: report unterminated string error
        ;; NOTE(review): this path returns a nonzero length without
        ;; advancing cursor — confirm the caller cannot loop on it
        mov     rax, r14
        jmp     .epilogue
.not_string:
        xor     eax, eax                ; return 0 (zeroes all of rax)
.epilogue:
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        ret
|
||||
|
||||
;; Numbers are sequences of numeric characters, interspersed with underscores
|
||||
;; The leading character must be numeric
|
||||
;; In the future, numbers may be prefixed with '0x' for hexadecimal or '0b' for binary.
|
||||
|
|
@ -479,6 +587,23 @@ find_lexeme:
|
|||
; }
|
||||
.start:
|
||||
push r12
|
||||
; test special tokens:
|
||||
; if buffer[cursor] == '"' {
|
||||
call is_string
|
||||
test rax, rax
|
||||
jne .is_string
|
||||
; } else if buffer[cursor].is_numeric() {
|
||||
call is_number
|
||||
; return is_number();
|
||||
test rax, rax
|
||||
jne .is_number
|
||||
; } else if buffer[cursor..][..2] == "//" {
|
||||
call is_comment
|
||||
; // skip to end of line
|
||||
test rax, rax
|
||||
jne .is_comment
|
||||
; }
|
||||
.loop_init:
|
||||
mov r12, 1
|
||||
; for 1..NUM_LEXEMES {
|
||||
.loop:
|
||||
|
|
@ -512,11 +637,6 @@ find_lexeme:
|
|||
test rax, rax
|
||||
; return TOKEN_IDENT;
|
||||
jne .is_ident
|
||||
; } else if is_number() {
|
||||
call is_number
|
||||
test rax, rax
|
||||
; return TOKEN_NUMBER;
|
||||
jne .is_number
|
||||
; } else {
|
||||
mov rdi, [rsp + 8]
|
||||
mov rax, [rel cursor]
|
||||
|
|
@ -546,11 +666,6 @@ find_lexeme:
|
|||
test rax, rax
|
||||
; return TOKEN_IDENT;
|
||||
jne .is_ident
|
||||
; } else if is_number() {
|
||||
call is_number
|
||||
test rax, rax
|
||||
; return TOKEN_NUMBER;
|
||||
jne .is_number
|
||||
; } else {
|
||||
; return TOKEN_EOF;
|
||||
mov rdi, [rsp + 8]
|
||||
|
|
@ -560,6 +675,7 @@ find_lexeme:
|
|||
pop r12
|
||||
pop rdi
|
||||
pop rbp
|
||||
mov rax, rdi
|
||||
ret
|
||||
.is_ident:
|
||||
; rax = len
|
||||
|
|
@ -575,3 +691,14 @@ find_lexeme:
|
|||
mov qword [rdi], TOKEN_NUMBER
|
||||
mov [rdi + 16], rax
|
||||
jmp .epilogue
|
||||
.is_string:
|
||||
mov rdi, [rsp + 8]
|
||||
mov qword [rdi], TOKEN_STRING
|
||||
mov [rdi + 16], rax
|
||||
jmp .epilogue
|
||||
.is_comment:
|
||||
|
||||
mov rdi, [rsp + 8]
|
||||
mov qword [rdi], TOKEN_COMMENT
|
||||
mov [rdi + 16], rax
|
||||
jmp .epilogue
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ impl PartialEq for Lexeme {
|
|||
// Identifiers and numbers compare both token and lexeme
|
||||
30 | 31 => self.0 == other.0 && self.1 == other.1,
|
||||
_ => self.0 == other.0,
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -37,10 +36,9 @@ struct LexemeRaw {
|
|||
}
|
||||
impl AsLexeme for LexemeRaw {
|
||||
fn as_lexeme(self) -> Option<Lexeme> {
|
||||
let Self {token, lexeme, len} = self;
|
||||
let slice = unsafe {core::str::from_utf8_unchecked(
|
||||
core::slice::from_raw_parts(
|
||||
lexeme, len))};
|
||||
let Self { token, lexeme, len } = self;
|
||||
let slice =
|
||||
unsafe { core::str::from_utf8_unchecked(core::slice::from_raw_parts(lexeme, len)) };
|
||||
match token {
|
||||
1.. => Some(Lexeme(token, slice)),
|
||||
_ => None,
|
||||
|
|
@ -94,83 +92,137 @@ fn main() {
|
|||
tokeniser_init(c"tests/tokens/keywords.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(&collect_tokens()[..], &[
|
||||
Lexeme(4, ""),
|
||||
Lexeme(1, ""),
|
||||
Lexeme(2, ""),
|
||||
Lexeme(3, ""),
|
||||
Lexeme(4, ""),
|
||||
Lexeme(8, ""),
|
||||
Lexeme(13, ""),
|
||||
Lexeme(11, ""),
|
||||
Lexeme(10, ""),
|
||||
Lexeme(9, ""),
|
||||
Lexeme(5, ""),
|
||||
][..]);
|
||||
assert_eq!(
|
||||
&collect_tokens()[..],
|
||||
&[
|
||||
Lexeme(4, ""),
|
||||
Lexeme(1, ""),
|
||||
Lexeme(2, ""),
|
||||
Lexeme(3, ""),
|
||||
Lexeme(4, ""),
|
||||
Lexeme(8, ""),
|
||||
Lexeme(13, ""),
|
||||
Lexeme(11, ""),
|
||||
Lexeme(10, ""),
|
||||
Lexeme(9, ""),
|
||||
Lexeme(5, ""),
|
||||
][..]
|
||||
);
|
||||
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(c"tests/tokens/delimiters.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(&collect_tokens()[..], &[
|
||||
Lexeme(19, ""),
|
||||
Lexeme(18, ""),
|
||||
Lexeme(28, ""),
|
||||
Lexeme(29, ""),
|
||||
Lexeme(21, ""),
|
||||
Lexeme(20, ""),
|
||||
Lexeme(24, ""),
|
||||
Lexeme(12, ""),
|
||||
Lexeme(23, ""),
|
||||
Lexeme(22, ""),
|
||||
Lexeme(15, ""),
|
||||
][..]);
|
||||
assert_eq!(
|
||||
&collect_tokens()[..],
|
||||
&[
|
||||
Lexeme(19, ""),
|
||||
Lexeme(18, ""),
|
||||
Lexeme(28, ""),
|
||||
Lexeme(29, ""),
|
||||
Lexeme(21, ""),
|
||||
Lexeme(20, ""),
|
||||
Lexeme(24, ""),
|
||||
Lexeme(12, ""),
|
||||
Lexeme(23, ""),
|
||||
Lexeme(22, ""),
|
||||
Lexeme(15, ""),
|
||||
][..]
|
||||
);
|
||||
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(c"tests/tokens/identifier.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(&collect_tokens()[..], &[
|
||||
Lexeme(30, "this-is-an-ident"),
|
||||
Lexeme(30, "another_ident123"),
|
||||
Lexeme(30, "_underscore_test"),
|
||||
Lexeme(30, "mixedCASEIdent"),
|
||||
Lexeme(30, "number12345"),
|
||||
Lexeme(30, "____"),
|
||||
Lexeme(30, "_"),
|
||||
Lexeme(17, ""), Lexeme(30, "leading-minus"),
|
||||
Lexeme(30, "trailing-minus-"),
|
||||
]);
|
||||
assert_eq!(
|
||||
&collect_tokens()[..],
|
||||
&[
|
||||
Lexeme(30, "this-is-an-ident"),
|
||||
Lexeme(30, "another_ident123"),
|
||||
Lexeme(30, "_underscore_test"),
|
||||
Lexeme(30, "mixedCASEIdent"),
|
||||
Lexeme(30, "number12345"),
|
||||
Lexeme(30, "____"),
|
||||
Lexeme(30, "_"),
|
||||
Lexeme(17, ""),
|
||||
Lexeme(30, "leading-minus"),
|
||||
Lexeme(30, "trailing-minus-"),
|
||||
]
|
||||
);
|
||||
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(c"tests/tokens/function.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(&collect_tokens()[..], &[
|
||||
Lexeme(4, ""),
|
||||
Lexeme(30, "my-function"),
|
||||
Lexeme(19, ""),
|
||||
Lexeme(18, ""),
|
||||
Lexeme(12, ""),
|
||||
Lexeme(11, ""),
|
||||
Lexeme(21, ""),
|
||||
Lexeme(5, ""),
|
||||
Lexeme(10, ""),
|
||||
Lexeme(23, ""),
|
||||
Lexeme(20, ""),
|
||||
]);
|
||||
assert_eq!(
|
||||
&collect_tokens()[..],
|
||||
&[
|
||||
Lexeme(4, ""),
|
||||
Lexeme(30, "my-function"),
|
||||
Lexeme(19, ""),
|
||||
Lexeme(18, ""),
|
||||
Lexeme(12, ""),
|
||||
Lexeme(11, ""),
|
||||
Lexeme(21, ""),
|
||||
Lexeme(5, ""),
|
||||
Lexeme(10, ""),
|
||||
Lexeme(23, ""),
|
||||
Lexeme(20, ""),
|
||||
]
|
||||
);
|
||||
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(c"tests/tokens/comment.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(
|
||||
&collect_tokens()[..],
|
||||
&[
|
||||
Lexeme(33, ""),
|
||||
Lexeme(4, ""),
|
||||
Lexeme(30, "my-function"),
|
||||
Lexeme(19, ""),
|
||||
Lexeme(18, ""),
|
||||
Lexeme(12, ""),
|
||||
Lexeme(11, ""),
|
||||
Lexeme(21, ""),
|
||||
Lexeme(33, ""),
|
||||
Lexeme(5, ""),
|
||||
Lexeme(10, ""),
|
||||
Lexeme(23, ""),
|
||||
Lexeme(20, ""),
|
||||
]
|
||||
);
|
||||
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(c"tests/tokens/number.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(&collect_tokens()[..], &[
|
||||
Lexeme(31, "1234"),
|
||||
Lexeme(31, "123_345_"),
|
||||
Lexeme(31, "1234____56"),
|
||||
Lexeme(31, "1"),
|
||||
Lexeme(31, "0"),
|
||||
]);
|
||||
assert_eq!(
|
||||
&collect_tokens()[..],
|
||||
&[
|
||||
Lexeme(31, "1234"),
|
||||
Lexeme(31, "123_345_"),
|
||||
Lexeme(31, "1234____56"),
|
||||
Lexeme(31, "1"),
|
||||
Lexeme(31, "0"),
|
||||
]
|
||||
);
|
||||
|
||||
eprint!("Initializing tokeniser.. ");
|
||||
tokeniser_init(c"tests/tokens/strings.l".as_ptr());
|
||||
eprintln!("ok.");
|
||||
|
||||
assert_eq!(
|
||||
&collect_tokens()[..],
|
||||
&[[
|
||||
Lexeme(32, "\"this is a string\""),
|
||||
Lexeme(32, "\"another\nstring\nspanning multiple\n lines\""),
|
||||
Lexeme(32, "\"string with a \\\"quoted\\\" word\""),
|
||||
Lexeme(32, "\"a\""),
|
||||
Lexeme(32, "\"\"")
|
||||
],]
|
||||
);
|
||||
|
||||
eprintln!("Finished tokenising.");
|
||||
}
|
||||
|
|
|
|||
5
lang/tests/tokens/comment.l
Normal file
5
lang/tests/tokens/comment.l
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
// This is a comment line
|
||||
fn my-function() -> bool {
|
||||
// This function always returns false
|
||||
return false;
|
||||
}
|
||||
8
lang/tests/tokens/strings.l
Normal file
8
lang/tests/tokens/strings.l
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
"this is a string"
|
||||
"another
|
||||
string
|
||||
spanning multiple
|
||||
lines"
|
||||
"string with a \"quoted\" word"
|
||||
"a"
|
||||
""
|
||||
Loading…
Reference in a new issue