reduce warnings, enable edition 2024, fix tokeniser (1/?)

This commit is contained in:
janis 2025-10-28 12:06:24 +01:00
parent f1faac639c
commit 719451b935
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8
8 changed files with 634 additions and 54 deletions


@@ -1,7 +1,7 @@
# Makefile: Compile and link main.asm using nasm and mold, intermediate files in target/
TARGET_DIR := target
SRC := src/lib.asm src/int_to_str.asm src/vec.asm
SRC := src/lib.asm src/int_to_str.asm src/vec.asm src/tokeniser.asm src/file.asm
OBJ := $(patsubst src/%.asm,$(TARGET_DIR)/%.o,$(SRC))
BIN_SRC := src/main.asm src/panic.asm
@@ -28,7 +28,7 @@ test: test-bins
# pattern rule: compile each .rs into a binary with the same base name
$(TARGET_DIR)/tests/%: tests/%.rs | $(OBJ) $(TARGET_DIR)/tests
@echo "[$(RUSTC)] $< -> $@"
rustc -Clink-arg=-fuse-ld=mold $(OBJ_LINK_ARGS) -g -o $@ $<
rustc -Clink-arg=-fuse-ld=mold --edition=2024 $(OBJ_LINK_ARGS) -g -o $@ $<
$(TARGET_DIR):
mkdir -p $(TARGET_DIR)/tests

lang/src/file.asm (new file, 48 lines)

@@ -0,0 +1,48 @@
section .data
file_error_msg db "Could not open file: "
file_error_msg_len equ $ - file_error_msg
section .text
global fopen_read
extern strlen
extern eprint_str
extern panic
extern error_to_str
extern eprint_error
;; Opens file for reading:
;; rdi: pointer to filename (null-terminated)
fopen_read:
mov rax, 2 ; syscall: open
mov rsi, 0 ; flags: O_RDONLY
mov rdx, 0 ; mode
syscall
cmp rax, 0
jl .file_error
ret ;fd in rax
.file_error:
push rdi
mov rdi, rax
call eprint_error
lea rdi, [rel file_error_msg]
mov rsi, file_error_msg_len
call eprint_str
pop rdi
call strlen ; get length of filename
mov rsi, rax ; rsi = filename length
call eprint_str
mov rdi, 10
push rdi
mov rdi, rsp
mov rsi, 1
call eprint_str
pop rdi
call panic
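For reference, calling the new fopen_read from a Rust test binary would look roughly like the sketch below; the extern declaration and the open_tokens_file helper are hypothetical (not part of this commit) and assume file.o is linked in like the other objects, with the fd returned in rax read back as an i64.

unsafe extern "C" {
    unsafe fn fopen_read(path: *const core::ffi::c_char) -> i64; // fd in rax, panics on failure
}

fn open_tokens_file() -> i64 {
    // tests/tokens.l is the sample input added by this commit
    unsafe { fopen_read(c"tests/tokens.l".as_ptr()) }
}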


@@ -26,6 +26,7 @@ global allocate
global is_alpha
global is_numeric
global is_whitespace
global is_id_continue
global is_id_start
@@ -37,8 +38,8 @@ extern panic
;; Abort the program with a default panic message
oom:
lea rdi, [oom_msg]
lea rsi, [oom_msg_len]
lea rdi, [rel oom_msg]
mov rsi, oom_msg_len
call eprint_str
; exit with error code 1
mov rax, 60 ; syscall: exit
@@ -291,8 +292,8 @@ eprint_error:
; let err_code = err_code;
push rdi
; eprint_str(ERROR_STR, ERROR_STR.len());
lea rdi, [error_msg]
lea rsi, [error_msg_len]
lea rdi, [rel error_msg]
mov rsi, error_msg_len
call eprint_str
; let (err, len) = error_to_str(err_code);
pop rdi
@@ -365,10 +366,10 @@ is_numeric:
is_id_continue:
call is_alpha
test rax, rax
je .is_id_continue_ret
jne .is_id_continue_ret
call is_numeric
test rax, rax
je .is_id_continue_ret
jne .is_id_continue_ret
cmp cl, '_'
je .is_id_continue_ret
xor rax, rax
@@ -381,7 +382,7 @@ is_id_continue:
is_id_start:
call is_alpha
test rax, rax
je .is_ret
jne .is_ret
cmp cl, '_'
je .is_ret
xor rax, rax
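The je → jne flips make the helpers return true as soon as one character class matches instead of bailing out on the first hit. In Rust terms the intended predicates are roughly the following sketch, assuming is_alpha and is_numeric are plain ASCII checks on the byte in cl:

// Sketch of the semantics behind is_id_start / is_id_continue.
fn is_id_start(c: u8) -> bool {
    c.is_ascii_alphabetic() || c == b'_'
}

fn is_id_continue(c: u8) -> bool {
    c.is_ascii_alphabetic() || c.is_ascii_digit() || c == b'_'
}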


@@ -13,14 +13,13 @@ extern error_to_str
extern eprint_error
extern alloc_pages
extern allocate
extern fopen_read
extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start
extern vec_tests
section .data
hello_msg db "Hello, World!", 10
hello_msg_len equ $ - hello_msg
@@ -71,41 +70,6 @@ compiler_entry:
.exit:
call exit
;; Opens file for reading:
;; rdi: pointer to filename (null-terminated)
fopen_read:
mov rax, 2 ; syscall: open
mov rsi, 0 ; flags: O_RDONLY
mov rdx, 0 ; mode
syscall
cmp rax, 0
jl .file_error
ret ;fd in rax
.file_error:
push rdi
mov rdi, rax
call eprint_error
lea rdi, [rel file_error_msg]
lea rsi, [rel file_error_msg_len]
call eprint_str
pop rdi
call strlen ; get length of filename
mov rsi, rax ; r9 = filename length
call eprint_str
mov rdi, 10
push rdi
mov rdi, rsp
mov rsi, 1
call eprint_str
pop rdi
call panic
;; =============================
;; Tokeniser functions
;; =============================

lang/src/tokeniser.asm (new file, 478 lines)

@@ -0,0 +1,478 @@
section .text
extern panic
extern strlen
extern strcmp
extern streq
extern memcpy
extern eprint_str
extern exit
extern error_to_str
extern eprint_error
extern alloc_pages
extern allocate
extern fopen_read
extern is_alpha
extern is_numeric
extern is_id_continue
extern is_id_start
extern is_whitespace
global tokeniser_init
global tokeniser_print
global find_lexeme
;; =============================
;; Tokeniser functions
;; =============================
;; tokeniser state
section .data
global input_file
global buffer
global cursor
global buffer_len
input_file dd 0
buffer dq 0
cursor dq 0
buffer_len dq 0
;; each buffer is chunk_size bytes large
;; buffer header structure:
;; +0 (8 bytes): pointer buffer
;; +8 (8 bytes): size of buffer
;; Tokens:
;; [let, if, else, fn, return, loop, break, continue, true, false, i32, u32, bool, =, +, -, *, /, %, ==, !=, <, <=, >, >=, &&, ||, !, (, ), {, }, [, ], ;, ',', ]
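The header layout above is only described, not yet used by the code below (which keeps the flat buffer/cursor/buffer_len globals); as a hypothetical #[repr(C)] view it would be:

#[repr(C)]
struct BufferHeader {
    buffer: *mut u8, // +0: pointer to the chunk
    size: usize,     // +8: size of the chunk in bytes
}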
section .bss
statbuf: resb 144
section .text
;; Initialises the tokeniser
;; rdi: pointer to filename (null-terminated)
tokeniser_init:
; open file for reading
; this panics if the file doesn't exist
call fopen_read
mov dword [rel input_file], eax ; store file descriptor
mov qword [rel cursor], 0
mov qword [rel buffer_len], 0
; fstat
mov rax, 5 ; syscall: fstat
mov edi, [rel input_file] ; fd (32-bit load, zero-extends into rdi)
lea rsi, [rel statbuf] ; statbuf
syscall
cmp rax, 0
jl .report_error
; get file size from statbuf
lea r15, [rel statbuf] ; st_size
mov r15, [r15 + 48] ; offset of st_size in stat struct
; allocate buffer
mov rdi, r15
call allocate
mov qword [rel buffer], rax
mov qword [rel buffer_len], r15
; read file into buffer
mov rax, 0 ; syscall: read
mov edi, [rel input_file] ; fd
mov rsi, [rel buffer] ; buf
mov rdx, [rel buffer_len] ; count
syscall
cmp rax, 0
jl .report_error
ret
.report_error:
mov rdi, rax ; eprint_error takes the error code in rdi
call eprint_error
call panic
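The init sequence is open, fstat to get st_size, allocate a buffer of that size, then read the whole file into it. A rough std-based Rust rendition of the same flow (for illustration only; the asm issues the syscalls directly and stores the results in the globals above):

use std::fs::File;
use std::io::Read;

fn tokeniser_init_sketch(path: &str) -> std::io::Result<Vec<u8>> {
    let mut file = File::open(path)?;            // open(2), O_RDONLY
    let size = file.metadata()?.len() as usize;  // fstat(2), st_size
    let mut buffer = vec![0u8; size];            // allocate(size)
    file.read_exact(&mut buffer)?;               // read(2) into the buffer
    Ok(buffer)                                   // becomes buffer/buffer_len, cursor = 0
}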
section .rodata
tokeniser_buffer db "Tokeniser buffer: ", 10
tokeniser_buffer_len equ $ - tokeniser_buffer
section .text
tokeniser_print:
lea rdi, [rel tokeniser_buffer]
mov rsi, tokeniser_buffer_len
call eprint_str
mov rax, [rel cursor]
mov rdi, [rel buffer]
add rdi, rax
mov rsi, [rel buffer_len]
sub rsi, rax ; print only the part from the cursor onwards
call eprint_str
ret
section .rodata
global LEXEMES
global TOKENS
global LEXEME_LENS
global NUM_LEXEMES
LEXEMES: dq \
LEX_NOT_A_LEXEME, \
LEX_LET, \
LEX_IF, \
LEX_ELSE, \
LEX_FN, \
LEX_RETURN, \
LEX_LOOP, \
LEX_BREAK, \
LEX_CONTINUE, \
LEX_TRUE, \
LEX_FALSE, \
LEX_BOOL, \
LEX_ARROW, \
LEX_I32, \
LEX_U32, \
LEX_EQUALS, \
LEX_PLUS, \
LEX_MINUS, \
LEX_RPARENS, \
LEX_LPARENS, \
LEX_RBRACE, \
LEX_LBRACE, \
LEX_COLON, \
LEX_SEMI, \
LEX_COMMA, \
LEX_PIPE, \
LEX_AMP, \
LEX_EQEQ
TOKENS: db \
TOKEN_EOF, \
TOKEN_LET, \
TOKEN_IF, \
TOKEN_ELSE, \
TOKEN_FN, \
TOKEN_RETURN, \
TOKEN_LOOP, \
TOKEN_BREAK, \
TOKEN_CONTINUE, \
TOKEN_TRUE, \
TOKEN_FALSE, \
TOKEN_BOOL, \
TOKEN_ARROW, \
TOKEN_I32, \
TOKEN_U32, \
TOKEN_EQUALS, \
TOKEN_PLUS, \
TOKEN_MINUS, \
TOKEN_RPARENS, \
TOKEN_LPARENS, \
TOKEN_RBRACE, \
TOKEN_LBRACE, \
TOKEN_COLON, \
TOKEN_SEMI, \
TOKEN_COMMA, \
TOKEN_PIPE, \
TOKEN_AMP, \
TOKEN_EQEQ
LEXEME_LENS: dq \
0, \
LEX_LET_len, \
LEX_IF_len, \
LEX_ELSE_len, \
LEX_FN_len, \
LEX_RETURN_len, \
LEX_LOOP_len, \
LEX_BREAK_len, \
LEX_CONTINUE_len, \
LEX_TRUE_len, \
LEX_FALSE_len, \
LEX_BOOL_len, \
LEX_ARROW_len, \
LEX_I32_len, \
LEX_U32_len, \
LEX_EQUALS_len, \
LEX_PLUS_len, \
LEX_MINUS_len, \
LEX_RPARENS_len, \
LEX_LPARENS_len, \
LEX_RBRACE_len, \
LEX_LBRACE_len, \
LEX_COLON_len, \
LEX_SEMI_len, \
LEX_COMMA_len, \
LEX_PIPE_len, \
LEX_AMP_len, \
LEX_EQEQ_len
NUM_LEXEMES equ 28
LEX_NOT_A_LEXEME db "<not a lexeme>", 0
TOKEN_EOF equ 0
TOKEN_LET equ 1
LEX_LET db "let"
LEX_LET_len equ $ - LEX_LET
TOKEN_IF equ 2
LEX_IF db "if"
LEX_IF_len equ $ - LEX_IF
TOKEN_ELSE equ 3
LEX_ELSE db "else"
LEX_ELSE_len equ $ - LEX_ELSE
TOKEN_FN equ 4
LEX_FN db "fn"
LEX_FN_len equ $ - LEX_FN
TOKEN_RETURN equ 5
LEX_RETURN db "return"
LEX_RETURN_len equ $ - LEX_RETURN
TOKEN_LOOP equ 6
LEX_LOOP db "loop"
LEX_LOOP_len equ $ - LEX_LOOP
TOKEN_BREAK equ 7
LEX_BREAK db "break"
LEX_BREAK_len equ $ - LEX_BREAK
TOKEN_CONTINUE equ 8
LEX_CONTINUE db "continue"
LEX_CONTINUE_len equ $ - LEX_CONTINUE
TOKEN_TRUE equ 9
LEX_TRUE db "true"
LEX_TRUE_len equ $ - LEX_TRUE
TOKEN_FALSE equ 10
LEX_FALSE db "false"
LEX_FALSE_len equ $ - LEX_FALSE
TOKEN_BOOL equ 11
LEX_BOOL db "bool"
LEX_BOOL_len equ $ - LEX_BOOL
TOKEN_ARROW equ 12
LEX_ARROW db "->"
LEX_ARROW_len equ $ - LEX_ARROW
TOKEN_I32 equ 13
LEX_I32 db "i32"
LEX_I32_len equ $ - LEX_I32
TOKEN_U32 equ 14
LEX_U32 db "u32"
LEX_U32_len equ $ - LEX_U32
TOKEN_EQUALS equ 15
LEX_EQUALS db "="
LEX_EQUALS_len equ $ - LEX_EQUALS
TOKEN_PLUS equ 16
LEX_PLUS db "+"
LEX_PLUS_len equ $ - LEX_PLUS
TOKEN_MINUS equ 17
LEX_MINUS db "-"
LEX_MINUS_len equ $ - LEX_MINUS
TOKEN_RPARENS equ 18
LEX_RPARENS db ")"
LEX_RPARENS_len equ $ - LEX_RPARENS
TOKEN_LPARENS equ 19
LEX_LPARENS db "("
LEX_LPARENS_len equ $ - LEX_LPARENS
TOKEN_RBRACE equ 20
LEX_RBRACE db "}"
LEX_RBRACE_len equ $ - LEX_RBRACE
TOKEN_LBRACE equ 21
LEX_LBRACE db "{"
LEX_LBRACE_len equ $ - LEX_LBRACE
TOKEN_COLON equ 22
LEX_COLON db ":"
LEX_COLON_len equ $ - LEX_COLON
TOKEN_SEMI equ 23
LEX_SEMI db ";"
LEX_SEMI_len equ $ - LEX_SEMI
TOKEN_COMMA equ 24
LEX_COMMA db ","
LEX_COMMA_len equ $ - LEX_COMMA
TOKEN_PIPE equ 25
LEX_PIPE db "|"
LEX_PIPE_len equ $ - LEX_PIPE
TOKEN_AMP equ 26
LEX_AMP db "&"
LEX_AMP_len equ $ - LEX_AMP
TOKEN_EQEQ equ 27
LEX_EQEQ db "=="
LEX_EQEQ_len equ $ - LEX_EQEQ
TOKEN_IDENT equ 28
LEX_IDENT db "<identifier>"
LEX_IDENT_len equ $ - LEX_IDENT
TOKEN_NUMBER equ 29
LEX_NUMBER db "<number>"
LEX_NUMBER_len equ $ - LEX_NUMBER
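LEXEMES, LEXEME_LENS and TOKENS are parallel tables: the same index selects a lexeme's text, its length, and its token value, with index 0 reserved for the <not a lexeme> placeholder, so a token value below NUM_LEXEMES doubles as a table index. A small Rust sketch of that layout (hypothetical types, only to illustrate the indexing):

struct LexemeTable {
    lexemes: [&'static str; 28], // LEXEMES + LEXEME_LENS
    tokens: [u8; 28],            // TOKENS
}

impl LexemeTable {
    fn lexeme_for(&self, token: u8) -> &'static str {
        self.lexemes[token as usize] // valid for 0 < token < NUM_LEXEMES
    }
}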
section .text
;; rdi: length of matched lexeme
is_ident:
push rbp
mov rbp, rsp
push r12
push r13
push r14
push rdi
mov rax, [rel cursor]
mov r12, [rel buffer]
mov r13, [rel buffer_len]
sub r13, rax
add r12, rax
; check first char is id_start
mov dil, [r12]
call is_id_start
test rax, rax
je .not_ident
mov r14, 1
.loop:
cmp r14, r13
jge .done
mov dil, [r12 + r14]
; check for id_continue
call is_id_continue
test rax, rax
je .done
inc r14
jmp .loop
.done:
; r14 is length of ident
mov rdi, [rsp]
cmp r14, rdi
jle .not_ident
mov rax, [rel cursor]
add rax, r14
mov [rel cursor], rax
mov rax, 1
jmp .epilogue
.not_ident:
xor rax, rax
.epilogue:
pop rdi
pop r14
pop r13
pop r12
pop rbp
ret
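is_ident scans an identifier starting at the cursor and accepts it only if it is strictly longer than the keyword lexeme that already matched (the length passed in rdi), so letx lexes as an identifier while let stays a keyword. A Rust sketch of that rule; unlike the asm, the sketch returns the length and leaves advancing the cursor to the caller:

fn is_ident_sketch(buf: &[u8], cursor: usize, matched_len: usize) -> Option<usize> {
    let is_start = |c: u8| c.is_ascii_alphabetic() || c == b'_';
    let is_cont = |c: u8| c.is_ascii_alphanumeric() || c == b'_';
    let rest = &buf[cursor..];
    if !rest.first().copied().map_or(false, is_start) {
        return None;
    }
    let len = rest.iter().take_while(|&&c| is_cont(c)).count();
    (len > matched_len).then_some(len)
}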
is_number:
xor rax, rax
ret
skip_whitespaces:
push rbp
mov rbp, rsp
push r12
push r13
; let start = buffer.add(cursor);
; let end = buffer.add(buffer_len);
mov rax, [rel cursor]
mov r12, [rel buffer]
mov r13, [rel buffer_len]
add r13, r12 ; end
add r12, rax ; start
; for ptr in start..end {
.loop:
cmp r12, r13
jge .done
mov dil, [r12]
call is_whitespace
test rax, rax
je .done
inc r12
jmp .loop
.done:
sub r12, [rel buffer] ; convert the end pointer back into an offset
mov [rel cursor], r12 ; persist the skipped whitespace
pop r13
pop r12
pop rbp
ret
find_lexeme:
push rbp
mov rbp, rsp
; skip whitespaces
call skip_whitespaces
; check length
mov rax, [rel cursor]
mov rcx, [rel buffer_len]
; if cursor >= buffer_len {
cmp rax, rcx
jge .eof
jmp .start
.eof:
; return TOKEN_EOF;
mov rax, TOKEN_EOF
pop rbp
ret
; }
.start:
push r12
mov r12, 1
; for 1..NUM_LEXEMES {
.loop:
cmp r12, NUM_LEXEMES
jge .not_found
; let lexeme = LEXEMES[i];
lea rdi, [rel LEXEMES]
mov rdi, [rdi + r12*8]
lea rdx, [rel LEXEME_LENS]
mov rsi, [rdx + r12*8]
mov rax, [rel cursor]
mov rdx, [rel buffer]
add rdx, rax
; let len = LEXEME_LENS[i];
mov rcx, [rel buffer_len]
sub rcx, rax
jo .not_found
; if lexeme.len() > buffer.len() - cursor {
cmp rsi, rcx
jg .not_found
; goto .not_found
; }
mov rcx, rsi
; if buffer[cursor..cursor+len] == lexeme {
call streq
test rax, rax
je .next
; if is_ident() {
call is_ident
test rax, rax
; return TOKEN_IDENT;
jne .is_ident
; } else if is_number() {
call is_number
test rax, rax
; return TOKEN_NUMBER;
jne .is_number
; } else {
mov rax, [rel cursor]
; cursor += len;
lea rdi, [rel LEXEME_LENS]
mov rdi, [rdi + r12*8]
add rax, rdi
mov [rel cursor], rax
; return TOKENS[i];
lea rax, [rel TOKENS]
mov al, [rax + r12]
and rax, 0xFF
jmp .epilogue
; }
.next:
inc r12
jmp .loop
; }
; }
.not_found:
; if is_ident() {
call is_ident
test rax, rax
; return TOKEN_IDENT;
jne .is_ident
; } else if is_number() {
call is_number
test rax, rax
; return TOKEN_NUMBER;
jne .is_number
; } else {
; return TOKEN_EOF;
mov rax, TOKEN_EOF
; }
.epilogue:
pop r12
pop rbp
ret
.is_ident:
mov rax, TOKEN_IDENT
jmp .epilogue
.is_number:
mov rax, TOKEN_NUMBER
jmp .epilogue
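Taken together, find_lexeme's control flow corresponds roughly to the self-contained Rust sketch below; the lexeme table is truncated, the token numbers only mirror the constants above, and the number path is stubbed out just like is_number:

fn find_lexeme_sketch(buf: &[u8], cursor: &mut usize) -> u8 {
    const TOKEN_EOF: u8 = 0;
    const TOKEN_IDENT: u8 = 28;
    const LEXEMES: &[(&[u8], u8)] = &[(b"let", 1), (b"if", 2), (b"else", 3), (b"fn", 4)];
    let is_start = |c: u8| c.is_ascii_alphabetic() || c == b'_';
    let is_cont = |c: u8| c.is_ascii_alphanumeric() || c == b'_';

    // skip_whitespaces
    while *cursor < buf.len() && buf[*cursor].is_ascii_whitespace() {
        *cursor += 1;
    }
    if *cursor >= buf.len() {
        return TOKEN_EOF;
    }
    // length of the identifier starting at the cursor, if any
    let ident_len = if is_start(buf[*cursor]) {
        buf[*cursor..].iter().take_while(|&&c| is_cont(c)).count()
    } else {
        0
    };
    for &(lexeme, token) in LEXEMES {
        if buf[*cursor..].starts_with(lexeme) {
            // a keyword only wins if it is not the prefix of a longer identifier
            if ident_len > lexeme.len() {
                break;
            }
            *cursor += lexeme.len();
            return token;
        }
    }
    if ident_len > 0 {
        *cursor += ident_len;
        return TOKEN_IDENT;
    }
    TOKEN_EOF // numbers are not handled yet
}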

lang/tests/tokens.l (new file, 1 line)

@@ -0,0 +1 @@
if let else fn

lang/tests/tokens.rs (new file, 75 lines)

@@ -0,0 +1,75 @@
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
struct Lexeme(u8);
impl Lexeme {
fn lex(&self) -> &'static str {
// SAFETY: lens contains the correct length for each lexeme, and lexemes
// contains pointers to valid 'static UTF-8 data.
unsafe {
core::str::from_utf8_unchecked(
core::slice::from_raw_parts(
(&raw const LEXEMES).cast::<*const u8>().add((self.0) as usize).read(),
(&raw const LEXEME_LENS).cast::<usize>().add((self.0) as usize).read(),
)
)
}
}
}
trait AsLexeme {
fn as_lexeme(self) -> Option<Lexeme>;
}
impl AsLexeme for u8 {
fn as_lexeme(self) -> Option<Lexeme> {
match self {
1..=10 => Some(Lexeme(self)),
_ => None,
}
}
}
#[allow(dead_code)]
unsafe extern "C" {
unsafe fn tokeniser_init(path: *const i8) -> ();
unsafe fn tokeniser_print() -> ();
unsafe fn is_ident(len: usize) -> bool;
unsafe fn is_number(len: usize) -> bool;
unsafe fn skip_whitespace() -> ();
unsafe fn find_lexeme() -> u8;
static mut LEXEMES: *mut *const u8;
static mut LEXEME_LENS: *mut usize;
static mut NUM_LEXEMES: usize;
static mut TOKENS: *mut u32;
static mut input_file: u32;
static mut buffer: *mut u8;
static mut cursor: usize;
static mut buffer_len: usize;
unsafe fn exit(code: i32) -> !;
}
fn main() {
let path = c"tests/tokens.l";
unsafe {
assert_eq!((&raw const input_file).read(), 0);
assert_eq!((&raw const buffer_len).read(), 0);
assert_eq!((&raw const cursor).read(), 0);
assert_eq!((&raw const buffer).read(), core::ptr::null_mut());
eprint!("Initializing tokeniser.. ");
tokeniser_init(path.as_ptr());
eprintln!("ok.");
eprintln!("{}: {:?}[{}..{}]", (&raw const input_file).read(), (&raw const buffer).read(), (&raw const cursor).read(), (&raw const buffer_len).read());
tokeniser_print();
if let Some(lexeme) = find_lexeme().as_lexeme() {
eprintln!("Found lexeme: {}", lexeme.lex());
}
}
}


@@ -7,7 +7,7 @@ pub struct BlobVec {
pub drop: Option<extern "C" fn(*mut u8)>,
}
#[no_mangle]
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
@@ -20,14 +20,25 @@ unsafe extern "C" {
unsafe fn vec_push(vec: *mut BlobVec, elem: *const u8, size: usize);
unsafe fn vec_pop(vec: *mut BlobVec);
unsafe fn vec_get(vec: *mut BlobVec, index: usize) -> *mut u8;
unsafe fn vec_remove(vec: *mut BlobVec, index: usize);
unsafe fn vec_drop(vec: *mut BlobVec);
unsafe fn exit(code: i32) -> !;
#[allow(dead_code)]
unsafe fn vec_remove(vec: *mut BlobVec, index: usize);
#[allow(dead_code)]
unsafe fn vec_drop(vec: *mut BlobVec);
}
fn main() {
static mut DROPS: usize = 1;
fn get_drops() -> usize {
unsafe { (&raw const DROPS).read() }
}
unsafe fn update_drops(f: impl FnOnce(&mut usize)) {
unsafe {
let drops = &raw mut DROPS;
f(&mut *drops);
}
}
let mut vec = BlobVec {
data: core::ptr::null_mut(),
len: 0,
@@ -43,7 +54,9 @@ fn main() {
extern "C" fn drop(ptr: *mut u8) {
unsafe {
DROPS *= ptr.cast::<u32>().read() as usize;
update_drops(|drops| {
*drops *= ptr.cast::<u32>().read() as usize;
});
}
}
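The get_drops/update_drops indirection is the edition 2024 part of this change: references to a static mut (&DROPS or &mut DROPS, which assert_eq! creates internally) are rejected by default under edition 2024 via the static_mut_refs lint, so the counter is only touched through raw pointers created with &raw const / &raw mut. A minimal stand-alone sketch of the same pattern, with COUNTER as a hypothetical stand-in for DROPS:

static mut COUNTER: usize = 0;

fn read_counter() -> usize {
    // &raw const yields a raw pointer without ever forming a shared reference
    unsafe { (&raw const COUNTER).read() }
}

fn bump_counter(by: usize) {
    unsafe {
        let p = &raw mut COUNTER; // likewise, no &mut reference is created
        *p += by;
    }
}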
@@ -56,10 +69,10 @@ fn main() {
assert_eq!(as_slice::<u32>(&vec), &[2]);
let retrieved = *(vec_get(&mut vec, 0) as *mut u32);
assert_eq!(retrieved, 2);
assert_eq!(DROPS, 1);
assert_eq!(get_drops(), 1);
vec_pop(&mut vec);
assert_eq!(vec.len, 0);
assert_eq!(DROPS, 2);
assert_eq!(get_drops(), 2);
value = 3;
vec_push(&mut vec, &value as *const u32 as *const u8, 4);
assert_eq!(as_slice::<u32>(&vec), &[3]);
@@ -69,7 +82,7 @@ fn main() {
assert_eq!(vec.len, 2);
vec_pop(&mut vec);
vec_pop(&mut vec);
assert_eq!(DROPS, 2 * 3 * 5);
assert_eq!(get_drops(), 2 * 3 * 5);
eprintln!("Push/pop test passed\n");
}
}