Compare commits

...

8 commits

Author SHA1 Message Date
janis 8f4d626968
ast tests 2025-10-29 22:10:34 +01:00
janis 4e55fa74f4
parse structs and functions from asm for rust tests 2025-10-29 22:10:14 +01:00
janis 5ae3e17693
initial ast 2025-10-29 20:39:32 +01:00
janis bf9d07b462
init tokeniser with buffer 2025-10-29 20:39:22 +01:00
janis 46053090f4
initial parsing 2025-10-29 16:21:15 +01:00
janis 39e8d6ae96
move definitions out of tokeniser into include file 2025-10-29 16:21:01 +01:00
janis 62751f30ab
move more stuff to shared test file 2025-10-29 16:20:42 +01:00
janis 86bbab90c3
modularise test with shared rust structs
add expect/unwrap token methods to tokeniser to aid with parsing
2025-10-29 14:00:17 +01:00
12 changed files with 1686 additions and 522 deletions

View file

@ -1,7 +1,7 @@
# Makefile: Compile and link main.asm using nasm and mold, intermediate files in target/
TARGET_DIR := target
SRC := src/lib.asm src/int_to_str.asm src/vec.asm src/tokeniser.asm src/file.asm src/alloc.asm
SRC := src/lib.asm src/int_to_str.asm src/vec.asm src/tokeniser.asm src/file.asm src/alloc.asm src/ast.asm
OBJ := $(patsubst src/%.asm,$(TARGET_DIR)/%.o,$(SRC))
BIN_SRC := src/main.asm src/panic.asm
@ -29,8 +29,12 @@ fmt: $(wildcard tests/*.rs)
@echo "Formatting test source files..."
rustfmt --edition 2024 $^
tests/shared/defs.rs: $(wildcard src/*)
@echo "Generating shared definitions for tests..."
./tests/asm_to_rust.py $^ -o $@
# pattern rule: compile each .rs into a binary with the same base name
$(TARGET_DIR)/tests/%: tests/%.rs | $(OBJ) $(TARGET_DIR)/tests
$(TARGET_DIR)/tests/%: tests/%.rs tests/shared/defs.rs $(OBJ) | $(TARGET_DIR)/tests
@echo "[$(RUSTC)] $< -> $@"
rustc -Clink-arg=-fuse-ld=mold --edition=2024 $(OBJ_LINK_ARGS) -g -o $@ $<

489
lang/src/ast.asm Normal file
View file

@ -0,0 +1,489 @@
default rel
%include "src/tokeniser.inc"
section .rdata
;; start-consts
AST_FUNCTION equ 1
AST_BLOCK equ 2
AST_VARIABLE equ 3
AST_NUMBER equ 4
AST_BINARY_OP equ 5
AST_RETURN_STATEMENT equ 6
TYPE_VOID equ 1
TYPE_BOOL equ 2
TYPE_I32 equ 3
TYPE_U32 equ 4
TYPE_STR equ 5
;; end-consts
section .text
extern vec_init_with
extern vec_push
extern vec_get
extern panic
extern memcpy
extern vec_binary_search_by
extern vec_insert
extern bump_alloc
extern tokeniser_init
extern find_lexeme
extern peek_lexeme
extern expect_token
extern unwrap_token
extern peek_expect_token
extern str_to_int
global parse_func
global parse_args
global parse_expr
global parse_binary_expr
global parse_primary_expr
global parse_statement
global parse_block
;; start very simple, with only functions and addition
;; ```rust
;; use super::vec::Vec;
;; ```
;; start-structs
;; struct Ast {
;; nodes: Vec<AstNode>,
;; }
;;
;; struct AstNode {
;; kind: u8,
;; data: *const (),
;; }
;;
;; struct Argument {
;; name: *const u8,
;; name_len: usize,
;; arg_type: Type,
;; }
;;
;; struct Type {
;; kind: u8,
;; }
;; end-structs
;; rdi: *mut Ast
;; define-fn: fn parse_func(ast: *mut Ast) -> u64
;; Parse `fn name(args...) -> type { body }`, push an AST_FUNCTION node
;; onto Ast.nodes, and return that node's index.
;;
;; Fixes vs. previous revision:
;;  - args_ptr/args_len are zeroed up front so the no-args path does not
;;    copy uninitialized stack garbage into the Function struct
;;  - the allocation pointer is kept in a stack slot across memcpy instead
;;    of assuming memcpy preserves rdi (caller-saved per SysV)
;;  - the final length read reloads the Ast pointer instead of dereferencing
;;    rdi, which vec_push may clobber (sibling functions already reload)
parse_func:
    push rbp
    mov rbp, rsp
    push rdi                        ; Ast -> [rsp + 56] after sub below
    sub rsp, 56
    ; Function locals (first 48 bytes are memcpy'd into the allocation):
    ;   name:        *const u8   [0..8]
    ;   name_len:    usize       [8..16]
    ;   args_ptr:    *const Arg  [16..24]
    ;   args_len:    usize       [24..32]
    ;   return_type: Type        [32..40]
    ;   body:        u64         [40..48]
    ;   scratch (Function* after bump_alloc) [48..56]
    mov qword [rsp + 16], 0         ; default: no arguments
    mov qword [rsp + 24], 0
    mov dil, TOKEN_FN
    call unwrap_token
    mov dil, TOKEN_IDENT
    call unwrap_token
    mov [rsp], rax                  ; function name
    mov [rsp + 8], rdx              ; function name length
    mov dil, TOKEN_LPARENS
    call unwrap_token
    mov dil, TOKEN_RPARENS
    call expect_token
    test rax, rax
    je .args                        ; no immediate ')': parse the arg list
.after_args:
    mov dil, TOKEN_ARROW
    call unwrap_token
    mov rdi, [rsp + 56]             ; Ast
    call parse_type
    mov [rsp + 32], rax             ; return type
    mov dil, TOKEN_LBRACE
    call peek_expect_token          ; body must start with '{'
    test rax, rax
    je .panic
    mov rdi, [rsp + 56]             ; Ast
    call parse_block
    mov [rsp + 40], rax             ; body node index
.epilogue:
    mov rdi, 48                     ; sizeof Function
    mov rsi, 8                      ; alignment
    call bump_alloc
    mov [rsp + 48], rax             ; keep Function* alive across memcpy
    mov rdi, rax
    mov rsi, rsp
    mov rdx, 48
    call memcpy
    mov rdx, [rsp + 48]
    mov byte [rsp], AST_FUNCTION    ; AstNode.kind (reuses copied slots)
    mov [rsp + 8], rdx              ; AstNode.data = Function*
    mov rdi, [rsp + 56]             ; Ast
    lea rsi, [rsp]
    call vec_push
    mov rdi, [rsp + 56]             ; reload Ast: vec_push clobbers rdi
    mov rax, [rdi + 8]              ; Ast.nodes.len()
    dec rax                         ; index of the node just pushed
    add rsp, 56
    pop rdi
    pop rbp
    ret
.args:
    mov rdi, [rsp + 56]             ; Ast
    call parse_args
    mov [rsp + 16], rax             ; args_ptr
    mov [rsp + 24], rdx             ; args_len
    jmp .after_args
.panic:
    call panic
;; rdi: *mut Ast
;; define-fn: fn parse_args(ast: *mut Ast) -> (*const Argument, usize)
;; Parse a comma-separated `name: type` list up to and including the
;; closing ')'. Returns the argument array pointer (rax) and count (rdx).
;;
;; FIX: the loop's ')' check was inverted (`jz .done_args`): expect_token
;; returns a non-zero lexeme pointer on match and 0 on mismatch (see
;; unwrap_token and the same check in parse_func), so the old code bailed
;; out whenever the next token was NOT ')' and never parsed any argument.
parse_args:
    push rbp
    mov rbp, rsp
    push rdi                        ; Ast -> [rsp + 64] after sub below
    sub rsp, 64
    ; Argument scratch: name [0..8], name_len [8..16], arg_type [16..24]
    ; args vec (5 qwords)                            [24..64]
    lea rdi, [rsp + 24]             ; vec
    mov rsi, 24                     ; sizeof Argument
    mov rdx, 0                      ; drop = None
    mov rcx, 16                     ; initial capacity
    call vec_init_with
.loop:
    mov dil, TOKEN_RPARENS
    call expect_token
    test rax, rax
    jnz .done_args                  ; ')' consumed -> list finished (was jz)
    mov dil, TOKEN_IDENT
    call unwrap_token
    mov [rsp], rax                  ; arg name
    mov [rsp + 8], rdx              ; arg name length
    mov dil, TOKEN_COLON
    call unwrap_token
    mov rdi, [rsp + 64]             ; Ast
    call parse_type
    mov [rsp + 16], rax             ; arg type
    lea rdi, [rsp + 24]             ; vec
    lea rsi, [rsp]                  ; &Argument
    call vec_push
    mov dil, TOKEN_COMMA
    call expect_token
    test rax, rax
    jz .end_loop                    ; no ',' -> ')' must follow
    jmp .loop
.end_loop:
    mov dil, TOKEN_RPARENS
    call unwrap_token
.done_args:
    mov rax, [rsp + 24]             ; args_ptr (vec.ptr)
    mov rdx, [rsp + 32]             ; args_len (vec.len)
    add rsp, 64
    pop rdi
    pop rbp
    ret
;; rdi: lexeme ptr
;; rsi: lexeme len
;; fn parse_number(lexeme: *const u8, lexeme_len: usize) -> u64
;; Parse an integer literal, honouring the radix prefixes 0x / 0o / 0b.
;;
;; FIX: the prefix test had the two characters swapped — it gated on the
;; SECOND char being 'x' and then compared the FIRST char against
;; '0'/'o'/'b'. Thus "0o7"/"0b1" were parsed as decimal and a string like
;; "ox.." hit the octal path. A prefix is first char '0', second char in
;; {x, o, b}; "0<digit>..." is treated as plain decimal.
parse_number:
    push rbp
    mov rbp, rsp
    push rbx                        ; we use bl; rbx is callee-saved
    sub rsp, 16
    mov [rsp], rdi                  ; lexeme ptr
    mov [rsp + 8], rsi              ; lexeme len
    cmp rsi, 2
    jbe .dec_radix                  ; too short for a prefix + digit
    mov al, byte [rdi]              ; first char
    mov bl, byte [rdi + 1]          ; second char
    cmp al, '0'                     ; a radix prefix starts with '0'
    jne .dec_radix
    cmp bl, 'x'
    je .hex_radix
    cmp bl, 'o'
    je .oct_radix
    cmp bl, 'b'
    je .bin_radix
    jmp .dec_radix                  ; "0<digit>...": plain decimal
.hex_radix:
    mov rax, 16
    jmp .radix_set
.oct_radix:
    mov rax, 8
    jmp .radix_set
.bin_radix:
    mov rax, 2
    jmp .radix_set
.dec_radix:
    mov rax, 10
    jmp .parse
.radix_set:
    add qword [rsp], 2              ; skip the two prefix chars
    sub qword [rsp + 8], 2
.parse:
    mov rdi, [rsp]                  ; lexeme ptr (past prefix if any)
    mov rsi, [rsp + 8]              ; lexeme len
    mov rdx, rax                    ; radix
    call str_to_int
    add rsp, 16
    pop rbx
    pop rbp
    ret
;; rdi: *mut Ast
;; define-fn: fn parse_primary_expr(ast: *mut Ast) -> u64
;; Parse a primary expression. Only number literals are handled so far;
;; anything else panics. Pushes an AST_NUMBER node whose `data` slot holds
;; the parsed value itself (no allocation) and returns the node's index.
parse_primary_expr:
    push rbp
    mov rbp, rsp
    sub rsp, 24
    ; frame: Ast [0..8], AstNode scratch {kind, data} [8..24]
    mov [rsp], rdi                  ; Ast
    mov dil, TOKEN_NUMBER
    call expect_token               ; rax = lexeme ptr, 0 if not a number
    test rax, rax
    jnz .number
    jmp .panic                      ; unsupported primary expression
.number:
    mov rdi, rax                    ; lexeme ptr
    mov rsi, rdx                    ; lexeme len
    call parse_number               ; rax = numeric value
    mov rdi, [rsp]                  ; Ast
    mov byte [rsp + 8], AST_NUMBER  ; AstNode.kind
    mov [rsp + 16], rax             ; AstNode.data = value stored inline
    lea rsi, [rsp + 8]              ; &AstNode
    call vec_push
    mov rdi, [rsp]                  ; reload Ast: vec_push clobbers rdi
    mov rax, [rdi + 8]              ; Ast.nodes.len()
    dec rax                         ; index of the node just pushed
    add rsp, 24
    pop rbp
    ret
.panic:
    call panic
;; rdi: *mut Ast
;; sil: precedence
;; define-fn: fn parse_binary_expr(ast: *mut Ast, precedence: u8) -> u64
;; Precedence-climbing binary-expression parser: parse a primary, then
;; keep folding `<left> <op> <right>` while the peeked token binds tighter
;; than `precedence`. Returns the node index of the resulting expression.
parse_binary_expr:
    push rbp
    mov rbp, rsp
    ; size: 24, align: 8
    ; start-structs
    ; struct BinaryExpr {
    ;     left: u64,
    ;     operator: u8,
    ;     right: u64,
    ; }
    ; end-structs
    sub rsp, 64
    ; frame layout:
    ; lexeme: Lexeme [32..56]  (reused as AstNode scratch once consumed)
    ; right: u64 [24..32]
    ; precedence: u8 [17..18]
    ; operator: u8 [16..17]
    ; left: u64 [8..16]
    ; rdi: *mut Ast [0..8]
    mov [rsp], rdi                  ; Ast
    mov byte [rsp + 17], sil        ; upper_precedence (binding floor)
    mov byte [rsp + 16], 0          ; operator: none yet
    call parse_primary_expr         ; rdi is still the Ast pointer here
    mov [rsp + 8], rax              ; left
.loop:
    lea rdi, [rsp + 32]             ; lexeme out-struct
    call peek_lexeme                ; look ahead without consuming
    mov rax, [rsp + 32]             ; peeked token kind
    mov dil, [rsp + 17]
    ; NOTE(review): the raw token id doubles as operator precedence in a
    ; signed byte compare — confirm the token numbering keeps this ordering
    ; as more operators are added.
    cmp al, dil                     ; our_precedence <= upper_precedence
    jle .done                       ; also covers some non-binary operator tokens
    cmp al, TOKEN_PLUS
    je .plus
    jmp .done                       ; not a recognised binary operator
.plus:
    mov dil, TOKEN_PLUS
    call unwrap_token               ; consume the '+'
    mov byte [rsp + 16], TOKEN_PLUS
    jmp .right
.right:
    mov rdi, [rsp]                  ; Ast
    mov sil, [rsp + 16]             ; operator id = new binding floor
    call parse_binary_expr          ; parse right-hand side
    mov [rsp + 24], rax             ; right
    mov rdi, 24                     ; sizeof BinaryExpr
    mov rsi, 8                      ; alignment
    call bump_alloc
    mov rdx, [rsp + 8]              ; left
    mov [rax + 0], rdx              ; BinaryExpr.left
    mov dl, byte [rsp + 16]         ; operator
    mov byte [rax + 8], dl          ; BinaryExpr.operator
    mov rdx, [rsp + 24]             ; right
    mov [rax + 16], rdx             ; BinaryExpr.right
    mov byte [rsp + 32], AST_BINARY_OP ; AstNode.kind (reuses lexeme slot)
    mov [rsp + 40], rax             ; AstNode.data
    mov rdi, [rsp]                  ; Ast
    lea rsi, [rsp + 32]             ; &AstNode
    call vec_push
    mov rdi, [rsp]                  ; reload Ast: vec_push clobbers rdi
    mov rax, [rdi + 8]              ; Ast.nodes.len()
    dec rax                         ; index of the node just pushed
    mov [rsp + 8], rax              ; folded node becomes the new left
    jmp .loop
.done:
    mov rax, [rsp + 8]              ; left = final expression node index
    add rsp, 64
    pop rbp
    ret
;; rdi: *mut Ast
;; define-fn: fn parse_expr(ast: *mut Ast) -> u64
;; Entry point for expression parsing: parse_binary_expr with the lowest
;; binding floor (0). rdi already carries the Ast pointer straight through.
;;
;; FIX: the previous revision reserved 8 stack bytes and stored rdi there
;; without ever reading it back (dead store), and the odd 8-byte frame left
;; rsp misaligned at the call site. Both the store and the frame are gone.
parse_expr:
    push rbp
    mov rbp, rsp
    mov sil, 0                      ; minimum precedence
    call parse_binary_expr          ; rax = root expression node index
    pop rbp
    ret
;; rdi: *mut Ast
;; define-fn: fn parse_statement(ast: *mut Ast) -> u64
;; Parse a single statement (currently only `return <expr>;`), push an
;; AST_RETURN_STATEMENT node and return its index.
;;
;; Fixes vs. previous revision:
;;  - the `return` check was inverted (`jz .return` took the panic path on
;;    an actual `return` token, since expect_token returns non-zero on match)
;;  - rdi was not reloaded with the Ast pointer before parse_expr
;;    (expect_token clobbers rdi internally)
;;  - AstNode.data pointed at a stack slot that dangles after this function
;;    returns; the expression index is now bump-allocated instead
parse_statement:
    push rbp
    mov rbp, rsp
    sub rsp, 32
    ; frame: AstNode.kind/result [0..8], AstNode.data [8..16],
    ;        expr index [16..24], Ast [24..32]
    mov [rsp + 24], rdi             ; Ast
    mov dil, TOKEN_RETURN
    call expect_token
    test rax, rax
    jnz .return                     ; 'return' consumed (was jz)
    jmp .panic                      ; only return statements are supported
.return:
    mov rdi, [rsp + 24]             ; Ast (expect_token clobbered rdi)
    call parse_expr
    mov [rsp + 16], rax             ; expression node index
    mov rdi, 8                      ; allocate the u64 payload so the node's
    mov rsi, 8                      ; data pointer outlives this frame
    call bump_alloc
    mov rdx, [rsp + 16]
    mov [rax], rdx                  ; *data = expression index
    mov byte [rsp], AST_RETURN_STATEMENT ; AstNode.kind
    mov [rsp + 8], rax              ; AstNode.data
    mov rdi, [rsp + 24]             ; Ast
    mov rsi, rsp                    ; &AstNode
    call vec_push
    mov rdi, [rsp + 24]             ; reload Ast: vec_push clobbers rdi
    mov rax, [rdi + 8]              ; Ast.nodes.len()
    dec rax                         ; index of the statement node
    mov [rsp], rax                  ; stash across unwrap_token
    mov dil, TOKEN_SEMI
    call unwrap_token               ; statements end with ';'
    mov rax, [rsp]                  ; return the statement node index
    add rsp, 32
    pop rbp
    ret
.panic:
    call panic
;; rdi: *mut Ast
;; define-fn: fn parse_block(ast: *mut Ast) -> u64
;; Parse `{ statement* }`, collect the statement node indices, push an
;; AST_BLOCK node holding them, and return that node's index.
;;
;; Fixes vs. previous revision (the function was half-wired):
;;  - the statements vec was used without ever being vec_init_with'd
;;  - [rsp + 16] was passed to parse_statement as "Ast" although the saved
;;    Ast pointer lives at [rsp + 56]
;;  - the closing '}' was never consumed; the loop now terminates on it
;;  - the pushed node had no kind byte (the vec struct itself was pushed);
;;    a proper Block {ptr, len} is now bump-allocated and wrapped in an
;;    AST_BLOCK AstNode
;; NOTE(review): vec layout (ptr at +0, len at +8, 40-byte struct) is
;; inferred from parse_args / vec_init_with usage — confirm against vec.asm.
;; start-structs
;; struct Block {
;;     statements: *const u64,
;;     statements_len: usize,
;; }
;; end-structs
parse_block:
    push rbp
    mov rbp, rsp
    push rdi                        ; Ast -> [rsp + 56] after sub below
    sub rsp, 56
    ; frame: AstNode.kind [0..8], AstNode.data / statement scratch [8..16],
    ;        statements vec (5 qwords) [16..56]
    mov dil, TOKEN_LBRACE
    call unwrap_token
    lea rdi, [rsp + 16]             ; statements vec
    mov rsi, 8                      ; element = u64 statement node index
    mov rdx, 0                      ; drop = None
    mov rcx, 16                     ; initial capacity
    call vec_init_with
.loop:
    mov dil, TOKEN_RBRACE
    call expect_token
    test rax, rax
    jnz .done                       ; '}' consumed -> block finished
    mov rdi, [rsp + 56]             ; Ast
    call parse_statement
    mov [rsp + 8], rax              ; statement node index
    lea rdi, [rsp + 16]             ; statements vec
    lea rsi, [rsp + 8]              ; &index
    call vec_push
    jmp .loop
.done:
    mov rdi, 16                     ; sizeof Block
    mov rsi, 8                      ; alignment
    call bump_alloc
    mov rdx, [rsp + 16]             ; vec.ptr
    mov [rax], rdx                  ; Block.statements
    mov rdx, [rsp + 24]             ; vec.len
    mov [rax + 8], rdx              ; Block.statements_len
    mov byte [rsp], AST_BLOCK       ; AstNode.kind
    mov [rsp + 8], rax              ; AstNode.data
    mov rdi, [rsp + 56]             ; Ast
    mov rsi, rsp                    ; &AstNode
    call vec_push
    mov rdi, [rsp + 56]             ; reload Ast: vec_push clobbers rdi
    mov rax, [rdi + 8]              ; Ast.nodes.len()
    dec rax                         ; index of the block node
    add rsp, 56
    pop rdi
    pop rbp
    ret
;; rdi: *mut Ast (currently unused by this function)
;; define-fn: fn parse_type(ast: *mut Ast) -> Type
;; Consume one token and map it to a TYPE_* constant in rax.
;; Panics on any token that is not a known type keyword.
parse_type:
    push rbp
    mov rbp, rsp
    push rdi
    sub rsp, 24                     ; find_lexeme out-struct {token, ptr, len}
    mov rdi, rsp
    call find_lexeme ; TODO: use peek here to allow failing gracefully
    mov rax, [rsp]                  ; token kind
    cmp al, TOKEN_I32
    je .i32_type
    cmp al, TOKEN_U32
    je .u32_type
    cmp al, TOKEN_VOID
    je .void_type
    cmp al, TOKEN_BOOL
    je .bool_type
    jmp .panic                      ; not a type keyword
.i32_type:
    mov rax, TYPE_I32
    jmp .epilogue
.u32_type:
    mov rax, TYPE_U32
    jmp .epilogue
.void_type:
    mov rax, TYPE_VOID
    jmp .epilogue
.bool_type:
    mov rax, TYPE_BOOL
    jmp .epilogue
.epilogue:
    add rsp, 24
    pop rdi
    pop rbp
    ret
.panic:
    call panic

View file

@ -19,8 +19,13 @@ extern is_id_start
extern is_whitespace
global tokeniser_init
global tokeniser_init_buf
global tokeniser_print
global find_lexeme
global expect_token
global unwrap_token
global peek_expect_token
global peek_lexeme
;; =============================
;; Tokeniser functions
@ -50,6 +55,22 @@ section .bss
statbuf: resb 144
section .text
;; Point the tokeniser at an in-memory buffer instead of a file, and
;; reset the cursor to the start. Used by tests to lex fixed strings.
;; rdi: pointer to buffer
;; rsi: length of buffer
tokeniser_init_buf:
    push rbp
    mov rbp, rsp
    ; NOTE(review): only the low 32 bits of input_file are cleared here —
    ; confirm input_file is a dword fd; if it is a qword this leaves stale
    ; upper bits behind.
    mov dword [rel input_file], 0
    mov qword [rel buffer], rdi
    mov qword [rel buffer_len], rsi
    mov qword [rel cursor], 0       ; start lexing at offset 0
    pop rbp
    ret
;; Initialises the tokeniser
;; rdx: pointer to filename (null-terminated)
tokeniser_init:
@ -113,210 +134,7 @@ global TOKENS
global LEXEME_LENS
global NUM_LEXEMES
align 8
LEXEMES:
dq LEX_NOT_A_LEXEME
dq LEX_LET
dq LEX_IF
dq LEX_ELSE
dq LEX_FN
dq LEX_RETURN
dq LEX_LOOP
dq LEX_BREAK
dq LEX_CONTINUE
dq LEX_TRUE
dq LEX_FALSE
dq LEX_BOOL
dq LEX_ARROW
dq LEX_I32
dq LEX_U32
dq LEX_EQUALS
dq LEX_PLUS
dq LEX_MINUS
dq LEX_RPARENS
dq LEX_LPARENS
dq LEX_RBRACE
dq LEX_LBRACE
dq LEX_COLON
dq LEX_SEMI
dq LEX_COMMA
dq LEX_PIPE
dq LEX_AMP
dq LEX_EQEQ
dq LEX_LBRACKET
dq LEX_RBRACKET
align 8
TOKENS:
db TOKEN_EOF ;; 0
db TOKEN_LET ;; 1
db TOKEN_IF ;; 2
db TOKEN_ELSE ;; 3
db TOKEN_FN ;; 4
db TOKEN_RETURN ;; 5
db TOKEN_LOOP ;; 6
db TOKEN_BREAK ;; 7
db TOKEN_CONTINUE ;; 8
db TOKEN_TRUE ;; 9
db TOKEN_FALSE ;; 10
db TOKEN_BOOL ;; 11
db TOKEN_ARROW ;; 12
db TOKEN_I32 ;; 13
db TOKEN_U32 ;; 14
db TOKEN_EQUALS ;; 15
db TOKEN_PLUS ;; 16
db TOKEN_MINUS ;; 17
db TOKEN_RPARENS ;; 18
db TOKEN_LPARENS ;; 19
db TOKEN_RBRACE ;; 20
db TOKEN_LBRACE ;; 21
db TOKEN_COLON ;; 22
db TOKEN_SEMI ;; 23
db TOKEN_COMMA ;; 24
db TOKEN_PIPE ;; 25
db TOKEN_AMP ;; 26
db TOKEN_EQEQ ;; 27
db TOKEN_LBRACKET ;; 28
db TOKEN_RBRACKET ;; 29
align 8
LEXEME_LENS:
dq 0
dq LEX_LET_len
dq LEX_IF_len
dq LEX_ELSE_len
dq LEX_FN_len
dq LEX_RETURN_len
dq LEX_LOOP_len
dq LEX_BREAK_len
dq LEX_CONTINUE_len
dq LEX_TRUE_len
dq LEX_FALSE_len
dq LEX_BOOL_len
dq LEX_ARROW_len
dq LEX_I32_len
dq LEX_U32_len
dq LEX_EQUALS_len
dq LEX_PLUS_len
dq LEX_MINUS_len
dq LEX_RPARENS_len
dq LEX_LPARENS_len
dq LEX_RBRACE_len
dq LEX_LBRACE_len
dq LEX_COLON_len
dq LEX_SEMI_len
dq LEX_COMMA_len
dq LEX_PIPE_len
dq LEX_AMP_len
dq LEX_EQEQ_len
dq LEX_LBRACKET_len
dq LEX_RBRACKET_len
align 8
NUM_LEXEMES: dq 30
LEX_NOT_A_LEXEME db "<not a lexeme>", 0
TOKEN_EOF equ 0
TOKEN_LET equ 1
LEX_LET db "let"
LEX_LET_len equ $ - LEX_LET
TOKEN_IF equ 2
LEX_IF db "if"
LEX_IF_len equ $ - LEX_IF
TOKEN_ELSE equ 3
LEX_ELSE db "else"
LEX_ELSE_len equ $ - LEX_ELSE
TOKEN_FN equ 4
LEX_FN db "fn"
LEX_FN_len equ $ - LEX_FN
TOKEN_RETURN equ 5
LEX_RETURN db "return"
LEX_RETURN_len equ $ - LEX_RETURN
TOKEN_LOOP equ 6
LEX_LOOP db "loop"
LEX_LOOP_len equ $ - LEX_LOOP
TOKEN_BREAK equ 7
LEX_BREAK db "break"
LEX_BREAK_len equ $ - LEX_BREAK
TOKEN_CONTINUE equ 8
LEX_CONTINUE db "continue"
LEX_CONTINUE_len equ $ - LEX_CONTINUE
TOKEN_TRUE equ 9
LEX_TRUE db "true"
LEX_TRUE_len equ $ - LEX_TRUE
TOKEN_FALSE equ 10
LEX_FALSE db "false"
LEX_FALSE_len equ $ - LEX_FALSE
TOKEN_BOOL equ 11
LEX_BOOL db "bool"
LEX_BOOL_len equ $ - LEX_BOOL
TOKEN_ARROW equ 12
LEX_ARROW db "->"
LEX_ARROW_len equ $ - LEX_ARROW
TOKEN_I32 equ 13
LEX_I32 db "i32"
LEX_I32_len equ $ - LEX_I32
TOKEN_U32 equ 14
LEX_U32 db "u32"
LEX_U32_len equ $ - LEX_U32
TOKEN_EQUALS equ 15
LEX_EQUALS db "="
LEX_EQUALS_len equ $ - LEX_EQUALS
TOKEN_PLUS equ 16
LEX_PLUS db "+"
LEX_PLUS_len equ $ - LEX_PLUS
TOKEN_MINUS equ 17
LEX_MINUS db "-"
LEX_MINUS_len equ $ - LEX_MINUS
TOKEN_RPARENS equ 18
LEX_RPARENS db ")"
LEX_RPARENS_len equ $ - LEX_RPARENS
TOKEN_LPARENS equ 19
LEX_LPARENS db "("
LEX_LPARENS_len equ $ - LEX_LPARENS
TOKEN_RBRACE equ 20
LEX_RBRACE db "}"
LEX_RBRACE_len equ $ - LEX_RBRACE
TOKEN_LBRACE equ 21
LEX_LBRACE db "{"
LEX_LBRACE_len equ $ - LEX_LBRACE
TOKEN_COLON equ 22
LEX_COLON db ":"
LEX_COLON_len equ $ - LEX_COLON
TOKEN_SEMI equ 23
LEX_SEMI db ";"
LEX_SEMI_len equ $ - LEX_SEMI
TOKEN_COMMA equ 24
LEX_COMMA db ","
LEX_COMMA_len equ $ - LEX_COMMA
TOKEN_PIPE equ 25
LEX_PIPE db "|"
LEX_PIPE_len equ $ - LEX_PIPE
TOKEN_AMP equ 26
LEX_AMP db "&"
LEX_AMP_len equ $ - LEX_AMP
TOKEN_EQEQ equ 27
LEX_EQEQ db "=="
LEX_EQEQ_len equ $ - LEX_EQEQ
TOKEN_LBRACKET equ 28
LEX_LBRACKET db "["
LEX_LBRACKET_len equ $ - LEX_LBRACKET
TOKEN_RBRACKET equ 29
LEX_RBRACKET db "]"
LEX_RBRACKET_len equ $ - LEX_RBRACKET
TOKEN_IDENT equ 30
LEX_IDENT db "<identifier>"
LEX_IDENT_len equ $ - LEX_IDENT
TOKEN_NUMBER equ 31
LEX_NUMBER db "<number>"
LEX_NUMBER_len equ $ - LEX_NUMBER
TOKEN_STRING equ 32
LEX_STRING db "<string>"
LEX_STRING_len equ $ - LEX_STRING
TOKEN_COMMENT equ 33
LEX_COMMENT db "<comment>"
LEX_COMMENT_len equ $ - LEX_COMMENT
%include "src/tokeniser.inc"
section .text
;; rdi: length of previously matched lexeme
@ -702,3 +520,66 @@ find_lexeme:
mov qword [rdi], TOKEN_COMMENT
mov [rdi + 16], rax
jmp .epilogue
;; dil: expected token kind
;; Try to consume the next lexeme if it matches the expected kind.
;; Returns: rax = lexeme pointer, rdx = lexeme length on a match;
;;          rax = rdx = 0 on a mismatch, with the cursor restored so the
;;          call is side-effect free (this is what makes it usable as a
;;          conditional "try" primitive by the parser).
expect_token:
    push rbp
    mov rbp, rsp
    sub rsp, 0x30
    mov [rsp], dil                  ; expected token kind
    mov rax, [rel cursor]           ; remember cursor for possible rewind
    mov [rsp + 8], rax
    lea rdi, [rsp + 0x10]           ; out-struct {token, lexeme ptr, lexeme len}
    call find_lexeme
    mov rax, [rsp + 0x10]           ; found token kind
    mov dil, [rsp]                  ; expected token kind
    cmp al, dil
    je .matched
    mov rdi, [rsp + 8]              ; mismatch: rewind the tokeniser
    mov [rel cursor], rdi
    xor rax, rax                    ; signal "no match" to the caller
    xor rdx, rdx
    jmp .epilogue
.matched:
    mov rax, [rsp + 0x18]           ; lexeme pointer
    mov rdx, [rsp + 0x20]           ; lexeme length
.epilogue:
    add rsp, 0x30
    pop rbp
    ret
;; dil: expected token kind
;; Consume the next lexeme, panicking unless it matches `dil`.
;; On success, forwards expect_token's results: rax = lexeme pointer,
;; rdx = lexeme length.
unwrap_token:
    push rbp
    mov rbp, rsp
    call expect_token               ; rax = 0 on mismatch (cursor restored)
    test rax, rax
    jnz .ok
    call panic                      ; hard failure: required token missing
.ok:
    pop rbp
    ret
;; dil: expected token kind
;; Like expect_token, but never consumes input: the saved cursor is put
;; back unconditionally, even when the token matched. Returns
;; expect_token's rax/rdx (non-zero lexeme pointer on match, 0 otherwise).
peek_expect_token:
    push rbp
    mov rbp, rsp
    push qword [rel cursor]         ; save position
    call expect_token
    pop rdi                         ; saved cursor value
    mov [rel cursor], rdi           ; rewind: pure lookahead
    pop rbp
    ret
;; rdi: out-struct pointer {token, lexeme ptr, lexeme len}
;; Fill the out-struct with the next lexeme WITHOUT advancing the cursor.
peek_lexeme:
    push rbp
    mov rbp, rsp
    push rdi                        ; keep caller's out-struct pointer
    push qword [rel cursor]         ; save cursor
    call find_lexeme                ; rdi still holds the out-struct here
    pop rdi                         ; saved cursor value
    mov [rel cursor], rdi           ; rewind: pure lookahead
    pop rax                         ; discard saved out-struct pointer
    pop rbp
    ret

213
lang/src/tokeniser.inc Normal file
View file

@ -0,0 +1,213 @@
section .rdata
align 8
LEXEMES:
dq LEX_NOT_A_LEXEME
dq LEX_LET
dq LEX_IF
dq LEX_ELSE
dq LEX_FN
dq LEX_RETURN
dq LEX_LOOP
dq LEX_BREAK
dq LEX_CONTINUE
dq LEX_TRUE
dq LEX_FALSE
dq LEX_BOOL
dq LEX_ARROW
dq LEX_I32
dq LEX_U32
dq LEX_EQUALS
dq LEX_PLUS
dq LEX_MINUS
dq LEX_RPARENS
dq LEX_LPARENS
dq LEX_RBRACE
dq LEX_LBRACE
dq LEX_COLON
dq LEX_SEMI
dq LEX_COMMA
dq LEX_PIPE
dq LEX_AMP
dq LEX_EQEQ
dq LEX_LBRACKET
dq LEX_RBRACKET
dq LEX_VOID
align 8
TOKENS:
db TOKEN_EOF ;; 0
db TOKEN_LET ;; 1
db TOKEN_IF ;; 2
db TOKEN_ELSE ;; 3
db TOKEN_FN ;; 4
db TOKEN_RETURN ;; 5
db TOKEN_LOOP ;; 6
db TOKEN_BREAK ;; 7
db TOKEN_CONTINUE ;; 8
db TOKEN_TRUE ;; 9
db TOKEN_FALSE ;; 10
db TOKEN_BOOL ;; 11
db TOKEN_ARROW ;; 12
db TOKEN_I32 ;; 13
db TOKEN_U32 ;; 14
db TOKEN_EQUALS ;; 15
db TOKEN_PLUS ;; 16
db TOKEN_MINUS ;; 17
db TOKEN_RPARENS ;; 18
db TOKEN_LPARENS ;; 19
db TOKEN_RBRACE ;; 20
db TOKEN_LBRACE ;; 21
db TOKEN_COLON ;; 22
db TOKEN_SEMI ;; 23
db TOKEN_COMMA ;; 24
db TOKEN_PIPE ;; 25
db TOKEN_AMP ;; 26
db TOKEN_EQEQ ;; 27
db TOKEN_LBRACKET ;; 28
db TOKEN_RBRACKET ;; 29
db TOKEN_VOID ;; 30
align 8
LEXEME_LENS:
dq 0
dq LEX_LET_len
dq LEX_IF_len
dq LEX_ELSE_len
dq LEX_FN_len
dq LEX_RETURN_len
dq LEX_LOOP_len
dq LEX_BREAK_len
dq LEX_CONTINUE_len
dq LEX_TRUE_len
dq LEX_FALSE_len
dq LEX_BOOL_len
dq LEX_ARROW_len
dq LEX_I32_len
dq LEX_U32_len
dq LEX_EQUALS_len
dq LEX_PLUS_len
dq LEX_MINUS_len
dq LEX_RPARENS_len
dq LEX_LPARENS_len
dq LEX_RBRACE_len
dq LEX_LBRACE_len
dq LEX_COLON_len
dq LEX_SEMI_len
dq LEX_COMMA_len
dq LEX_PIPE_len
dq LEX_AMP_len
dq LEX_EQEQ_len
dq LEX_LBRACKET_len
dq LEX_RBRACKET_len
dq LEX_VOID_len
align 8
; FIX: the LEXEMES/TOKENS/LEXEME_LENS tables above each have 31 entries
; (indices 0..30, the last being VOID). With the old count of 30 the
; "void" keyword could never match and always lexed as an identifier,
; making parse_type's TOKEN_VOID arm unreachable.
NUM_LEXEMES: dq 31
LEX_NOT_A_LEXEME db "<not a lexeme>", 0
LEX_LET db "let"
LEX_LET_len equ $ - LEX_LET
LEX_IF db "if"
LEX_IF_len equ $ - LEX_IF
LEX_ELSE db "else"
LEX_ELSE_len equ $ - LEX_ELSE
LEX_FN db "fn"
LEX_FN_len equ $ - LEX_FN
LEX_RETURN db "return"
LEX_RETURN_len equ $ - LEX_RETURN
LEX_LOOP db "loop"
LEX_LOOP_len equ $ - LEX_LOOP
LEX_BREAK db "break"
LEX_BREAK_len equ $ - LEX_BREAK
LEX_CONTINUE db "continue"
LEX_CONTINUE_len equ $ - LEX_CONTINUE
LEX_TRUE db "true"
LEX_TRUE_len equ $ - LEX_TRUE
LEX_FALSE db "false"
LEX_FALSE_len equ $ - LEX_FALSE
LEX_BOOL db "bool"
LEX_BOOL_len equ $ - LEX_BOOL
LEX_ARROW db "->"
LEX_ARROW_len equ $ - LEX_ARROW
LEX_I32 db "i32"
LEX_I32_len equ $ - LEX_I32
LEX_U32 db "u32"
LEX_U32_len equ $ - LEX_U32
LEX_EQUALS db "="
LEX_EQUALS_len equ $ - LEX_EQUALS
LEX_PLUS db "+"
LEX_PLUS_len equ $ - LEX_PLUS
LEX_MINUS db "-"
LEX_MINUS_len equ $ - LEX_MINUS
LEX_RPARENS db ")"
LEX_RPARENS_len equ $ - LEX_RPARENS
LEX_LPARENS db "("
LEX_LPARENS_len equ $ - LEX_LPARENS
LEX_RBRACE db "}"
LEX_RBRACE_len equ $ - LEX_RBRACE
LEX_LBRACE db "{"
LEX_LBRACE_len equ $ - LEX_LBRACE
LEX_COLON db ":"
LEX_COLON_len equ $ - LEX_COLON
LEX_SEMI db ";"
LEX_SEMI_len equ $ - LEX_SEMI
LEX_COMMA db ","
LEX_COMMA_len equ $ - LEX_COMMA
LEX_PIPE db "|"
LEX_PIPE_len equ $ - LEX_PIPE
LEX_AMP db "&"
LEX_AMP_len equ $ - LEX_AMP
LEX_EQEQ db "=="
LEX_EQEQ_len equ $ - LEX_EQEQ
LEX_LBRACKET db "["
LEX_LBRACKET_len equ $ - LEX_LBRACKET
LEX_RBRACKET db "]"
LEX_RBRACKET_len equ $ - LEX_RBRACKET
LEX_VOID db "void"
LEX_VOID_len equ $ - LEX_VOID
LEX_IDENT db "<identifier>"
LEX_IDENT_len equ $ - LEX_IDENT
LEX_NUMBER db "<number>"
LEX_NUMBER_len equ $ - LEX_NUMBER
LEX_STRING db "<string>"
LEX_STRING_len equ $ - LEX_STRING
LEX_COMMENT db "<comment>"
LEX_COMMENT_len equ $ - LEX_COMMENT
;; start-consts
TOKEN_EOF equ 0
TOKEN_LET equ 1
TOKEN_IF equ 2
TOKEN_ELSE equ 3
TOKEN_FN equ 4
TOKEN_RETURN equ 5
TOKEN_LOOP equ 6
TOKEN_BREAK equ 7
TOKEN_CONTINUE equ 8
TOKEN_TRUE equ 9
TOKEN_FALSE equ 10
TOKEN_BOOL equ 11
TOKEN_ARROW equ 12
TOKEN_I32 equ 13
TOKEN_U32 equ 14
TOKEN_EQUALS equ 15
TOKEN_PLUS equ 16
TOKEN_MINUS equ 17
TOKEN_RPARENS equ 18
TOKEN_LPARENS equ 19
TOKEN_RBRACE equ 20
TOKEN_LBRACE equ 21
TOKEN_COLON equ 22
TOKEN_SEMI equ 23
TOKEN_COMMA equ 24
TOKEN_PIPE equ 25
TOKEN_AMP equ 26
TOKEN_EQEQ equ 27
TOKEN_LBRACKET equ 28
TOKEN_RBRACKET equ 29
TOKEN_VOID equ 30
TOKEN_IDENT equ 31
TOKEN_NUMBER equ 32
TOKEN_STRING equ 33
TOKEN_COMMENT equ 34
;; end-consts

310
lang/tests/asm_to_rust.py Executable file
View file

@ -0,0 +1,310 @@
#!/usr/bin/env python3
"""
parse_asm_to_rust.py
Scan one or more assembly source files and extract:
- commented struct definitions inside `start-structs` / `end-structs` spans
- constant definitions inside `start-consts` / `end-consts` spans
- commented function-definition directives of the form `define-fn: fn ...`
- commented markdown rust fenced code blocks (```rust) and copy their inner code
into the generated Rust output (fences are removed and comment markers stripped)
Produce Rust source code containing:
- an `extern "C"` block with `pub unsafe fn ...;` declarations for each define-fn
- `pub const NAME: u32 = <value>;` lines for each `equ` constant found in const spans
- `#[repr(C)] pub struct Name { pub field: Type, ... }` for each struct found in struct spans
- verbatim Rust code copied from commented ```rust``` blocks (fences removed)
Notes:
- Struct and function definitions must appear on commented lines. Any number of leading semicolons
(e.g. `;`, `;;`, `;;;`) and surrounding spaces are allowed and will be stripped.
- Constant lines inside const spans may be commented or not; the script strips leading semicolons
before parsing.
- Commented rust blocks are expected to use commented fenced code blocks, e.g.:
;; ```rust
;; extern "C" { ... }
;; ```
The inner lines will be uncommented (leading semicolons removed) and included in output.
- By default the script writes to stdout. Use `-o` to write combined output to a file, or `-d`
to write one .rs file per input with the same basename.
"""
import argparse
import re
import sys
from pathlib import Path
from typing import List, Tuple, Dict, Any
LEADING_COMMENT_RE = re.compile(r'^\s*;+\s*') # lines that start with one or more semicolons
START_STRUCTS_RE = re.compile(r'^\s*;+\s*start-structs\b', re.IGNORECASE)
END_STRUCTS_RE = re.compile(r'^\s*;+\s*end-structs\b', re.IGNORECASE)
START_CONSTS_RE = re.compile(r'^\s*;+\s*start-consts\b', re.IGNORECASE)
END_CONSTS_RE = re.compile(r'^\s*;+\s*end-consts\b', re.IGNORECASE)
DEFINE_FN_RE = re.compile(r'^\s*;+\s*define-fn:\s*(.+)$', re.IGNORECASE)
CONST_EQU_RE = re.compile(r'^\s*([A-Za-z_]\w*)\s+equ\s+(.+)$', re.IGNORECASE)
STRUCT_START_RE = re.compile(r'^\s*struct\s+([A-Za-z_]\w*)\s*\{') # after comment markers stripped
RUST_FENCE_RE = re.compile(r'^\s*```\s*(rust)?\s*$', re.IGNORECASE) # matches ``` or ```rust (after stripping leading comment)
def strip_leading_semicolons(line: str) -> str:
    """Drop a leading run of semicolons (plus surrounding whitespace) and any trailing newline.

    Lines without a leading semicolon run are returned unchanged apart from
    newline trimming — their leading whitespace is deliberately preserved.
    """
    return re.sub(r'^\s*;+\s*', '', line).rstrip('\n')
def extract_structs_from_commented_lines(lines: List[str]) -> List[Tuple[str, List[str]]]:
    """
    Given lines already stripped of their leading ';' markers, find every
    'struct Name { ... }' block and return a list of (name, field_lines).

    A brace-balanced scan is used so struct bodies may contain nested
    braces in types. The trailing '}' line is not part of the body.
    """
    struct_start = re.compile(r'^\s*struct\s+([A-Za-z_]\w*)\s*\{')
    structs: List[Tuple[str, List[str]]] = []
    i = 0
    total = len(lines)
    while i < total:
        m = struct_start.match(lines[i])
        if not m:
            i += 1
            continue
        name = m.group(1)
        # Start the balance with the opening line's own braces.
        depth = lines[i].count('{') - lines[i].count('}')
        i += 1
        body: List[str] = []
        while i < total and depth > 0:
            depth += lines[i].count('{') - lines[i].count('}')
            body.append(lines[i])
            i += 1
        # Trim the closing '}' line if it ended up in the body.
        if body and body[-1].strip() == '}':
            body.pop()
        structs.append((name, body))
    return structs
def format_rust_struct(name: str, field_lines: List[str]) -> str:
    """
    Render field lines like ' nodes: Vec<AstNode>,' as a #[repr(C)] Rust
    struct with all-pub fields. Each field is split on the first ':' into
    name and type; lines without a ':' are emitted as-is behind `pub`.
    """
    body: List[str] = []
    for raw in field_lines:
        field = raw.strip().rstrip(',')
        if not field:
            continue  # skip blank field lines
        if ':' in field:
            fname, ftype = (part.strip() for part in field.split(':', 1))
            body.append(f' pub {fname}: {ftype},')
        else:
            body.append(f' pub {field},')
    header = ['#[repr(C)]', '#[derive(Debug)]', f'pub struct {name} {{']
    return '\n'.join(header + body + ['}'])
def parse_file(path: Path) -> Dict[str, Any]:
    """
    Parse a single assembly file and return dict with keys: 'functions', 'consts', 'structs', 'rust_blocks'
    - functions: list of signature strings (e.g. "parse_ast(data: *const u8) -> Ast")
    - consts: list of (name, value)
    - structs: list of (name, field_lines)
    - rust_blocks: list of rust code blocks; each block is list[str] of code lines (no fences, uncommented)

    Implementation: a line-oriented state machine. `in_structs`/`in_consts`
    track whether we are between start-/end- span markers; everything else
    (define-fn directives, commented ```rust``` fences) is handled at the
    top level.
    """
    functions: List[str] = []
    consts: List[Tuple[str, str]] = []
    structs: List[Tuple[str, List[str]]] = []
    rust_blocks: List[List[str]] = []
    with path.open('r', encoding='utf-8') as f:
        lines = f.readlines()
    i = 0
    n = len(lines)
    in_structs = False  # inside a start-structs / end-structs span
    in_consts = False   # inside a start-consts / end-consts span
    struct_buffer: List[str] = []  # raw commented lines of the current struct span
    const_buffer: List[str] = []   # raw lines of the current const span
    while i < n:
        raw = lines[i]
        # state transitions for start/end spans
        if not in_structs and START_STRUCTS_RE.match(raw):
            in_structs = True
            struct_buffer = []
            i += 1
            continue
        if in_structs and END_STRUCTS_RE.match(raw):
            # Span closed: uncomment buffered lines, then pull structs out.
            stripped = [strip_leading_semicolons(l) for l in struct_buffer if l.strip()]
            found = extract_structs_from_commented_lines(stripped)
            structs.extend(found)
            in_structs = False
            struct_buffer = []
            i += 1
            continue
        if not in_consts and START_CONSTS_RE.match(raw):
            in_consts = True
            const_buffer = []
            i += 1
            continue
        if in_consts and END_CONSTS_RE.match(raw):
            # Span closed: parse `NAME equ VALUE` lines (commented or not).
            for l in const_buffer:
                s = strip_leading_semicolons(l)
                m = CONST_EQU_RE.match(s)
                if m:
                    name = m.group(1)
                    value = m.group(2).strip()
                    consts.append((name, value))
            in_consts = False
            const_buffer = []
            i += 1
            continue
        # If inside special spans, collect lines
        if in_structs:
            # Only commented lines belong to a struct span; others are dropped.
            if LEADING_COMMENT_RE.match(raw):
                struct_buffer.append(raw)
        elif in_consts:
            const_buffer.append(raw)
        else:
            # Top-level: look for define-fn directives (must be commented lines)
            mfn = DEFINE_FN_RE.match(raw)
            if mfn:
                sig = mfn.group(1).strip()
                # Drop a leading `fn ` so only the bare signature is kept.
                if sig.startswith('fn '):
                    sig = sig[len('fn '):].strip()
                functions.append(sig)
            else:
                # Check for commented rust fenced block start
                if LEADING_COMMENT_RE.match(raw):
                    stripped = strip_leading_semicolons(raw)
                    if RUST_FENCE_RE.match(stripped):
                        # start collecting rust block until a closing fence is found
                        block_lines: List[str] = []
                        i += 1
                        while i < n:
                            cur = lines[i]
                            # If it's a commented fence closing, stop
                            if LEADING_COMMENT_RE.match(cur):
                                inner_stripped = strip_leading_semicolons(cur)
                                if RUST_FENCE_RE.match(inner_stripped):
                                    break
                                # otherwise, this is a commented code line; strip leading semicolons and append
                                block_lines.append(strip_leading_semicolons(cur))
                            else:
                                # If it's an uncommented line inside the block, include as-is (trim newline)
                                block_lines.append(cur.rstrip('\n'))
                            i += 1
                        rust_blocks.append(block_lines)
                        # advance past the closing fence line if present
                        # current i points at closing fence or EOF; advance one to continue main loop
                        i += 1
                        continue  # continue outer loop without incrementing i further
        i += 1
    return {
        'functions': functions,
        'consts': consts,
        'structs': structs,
        'rust_blocks': rust_blocks,
    }
def render_rust(function_sigs: List[str], consts: List[Tuple[str, str]],
                structs: List[Tuple[str, List[str]]], rust_blocks: List[List[str]]) -> str:
    """
    Assemble the generated Rust source: a lint-allow header, one
    extern "C" block for all function signatures, const declarations,
    #[repr(C)] structs, and finally any verbatim rust blocks (already
    uncommented and fence-less). Sections are separated by blank lines;
    trailing blank lines are trimmed and the file ends with a newline.
    """
    parts: List[str] = [
        '#![allow(non_camel_case_types, dead_code, non_upper_case_globals, improper_ctypes)]',
        '// Auto-generated Rust bindings from assembly source\n',
    ]
    if function_sigs:
        parts.append('unsafe extern "C" {')
        parts.extend(f' pub unsafe fn {sig};' for sig in function_sigs)
        parts.extend(['}', ''])
    parts.extend(f'pub const {name}: u32 = {value};' for name, value in consts)
    if consts:
        parts.append('')
    for name, field_lines in structs:
        parts.extend([format_rust_struct(name, field_lines), ''])
    for block in rust_blocks:
        # Guarantee a separating blank line before each verbatim block.
        if parts and parts[-1] != '':
            parts.append('')
        parts.extend(line.rstrip('\n') for line in block)
        parts.append('')
    while parts and parts[-1] == '':
        parts.pop()
    return '\n'.join(parts) + '\n' if parts else ''
def main(argv=None):
    """CLI entry point: parse the input files and emit the generated Rust,
    either combined to stdout / `-o FILE`, or one .rs per input via `-d DIR`."""
    parser = argparse.ArgumentParser(
        description='Parse assembly files and emit Rust externs, consts, struct defs, and commented ```rust``` blocks.')
    parser.add_argument('inputs', metavar='INPUT', type=Path, nargs='+',
                        help='assembly source files to parse')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-o', '--out', type=Path,
                       help='write combined Rust to this file (default stdout)')
    group.add_argument('-d', '--out-dir', type=Path,
                       help='write one .rs file per input into this directory')
    args = parser.parse_args(argv)

    # Accumulate everything for the combined output while also rendering
    # each input on its own for the -d mode.
    combined = {'functions': [], 'consts': [], 'structs': [], 'rust_blocks': []}
    per_file_output: Dict[Path, str] = {}
    for inp in args.inputs:
        if not inp.exists():
            print(f'warning: input file {inp} does not exist, skipping', file=sys.stderr)
            continue
        parsed = parse_file(inp)
        per_file_output[inp] = render_rust(parsed['functions'], parsed['consts'],
                                           parsed['structs'], parsed['rust_blocks'])
        for key in combined:
            combined[key].extend(parsed[key])

    if args.out_dir:
        # One output file per input, named after the input's basename.
        args.out_dir.mkdir(parents=True, exist_ok=True)
        for inp, src in per_file_output.items():
            outpath = args.out_dir / (inp.stem + '.rs')
            with outpath.open('w', encoding='utf-8') as f:
                f.write(src)
            print(f'Wrote {outpath}', file=sys.stderr)
        return 0

    combined_src = render_rust(combined['functions'], combined['consts'],
                               combined['structs'], combined['rust_blocks'])
    if args.out:
        with args.out.open('w', encoding='utf-8') as f:
            f.write(combined_src)
        print(f'Wrote {args.out}', file=sys.stderr)
    else:
        sys.stdout.write(combined_src)
    return 0
if __name__ == '__main__':
raise SystemExit(main())

66
lang/tests/ast.rs Normal file
View file

@ -0,0 +1,66 @@
#[path = "shared/shared.rs"]
mod util;
unsafe extern "C" {
unsafe fn bump_init();
unsafe fn tokeniser_init_buf(bytes: *const u8, len: usize) -> ();
}
use util::defs::{parse_expr, Ast, AstNode};
fn main() {
    // SAFETY: FFI into the assembly runtime; assumed safe to call once at
    // startup before any parsing — TODO confirm bump_init's contract.
    unsafe {
        bump_init();
    }
    println!("Bump allocator initialized.");
    // Feed the tokeniser a fixed in-memory expression instead of a file.
    let src = b"3 + 4";
    unsafe {
        tokeniser_init_buf(src.as_ptr(), src.len());
        let mut ast = Ast {
            nodes: util::vec::Vec::new(),
        };
        // parse_expr returns the index of the root expression node in ast.nodes.
        let expr_id = parse_expr(&mut ast);
        println!("Parsed expression with ID: {}", expr_id);
        println!("{:#}", &ast);
    }
}
impl std::fmt::Display for AstNode {
    /// Pretty-print one node. `data` is interpreted per `kind`: numbers
    /// store the value inline in the pointer field; binary ops store a
    /// pointer to a `BinaryExpr` (see src/ast.asm).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use util::defs::{BinaryExpr, AST_BINARY_OP, AST_NUMBER};
        // kind is a u8 on the assembly side; widen to match the u32 consts.
        match self.kind as u32 {
            AST_NUMBER => {
                // The numeric value lives directly in the data field.
                write!(f, "Number({})", self.data as usize)
            }
            AST_BINARY_OP => {
                // SAFETY-review: assumes data points at a valid, bump-allocated
                // BinaryExpr whenever kind == AST_BINARY_OP — upheld by ast.asm.
                let BinaryExpr {
                    left,
                    operator,
                    right,
                } = unsafe { self.data.cast::<util::defs::BinaryExpr>().read() };
                write!(
                    f,
                    "BinaryOp(op: {}, left: {}, right: {})",
                    operator, left, right
                )
            }
            _ => write!(f, "UnknownNode"),
        }
    }
}
impl core::fmt::Display for Ast {
    /// Render the node list as a bracketed, tab-indented listing with one
    /// `index: node` entry per line, separated by ", " line breaks.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        writeln!(f, "[")?;
        let mut entries = self.nodes.as_slice().iter().enumerate();
        if let Some((i, node)) = entries.next() {
            write!(f, "\t{i}: {node}")?;
        }
        for (i, node) in entries {
            writeln!(f, ", ")?;
            write!(f, "\t{i}: {node}")?;
        }
        writeln!(f, "\n]")
    }
}

View file

@ -1,9 +1,7 @@
#![feature(allocator_api, box_as_ptr)]
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
#[path = "shared/shared.rs"]
mod util;
unsafe extern "C" {
unsafe fn bump_init();

View file

@ -1,22 +1,7 @@
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
#[path = "shared/shared.rs"]
mod util;
#[repr(C)]
struct FFISlice {
ptr: *const u8,
len: usize,
}
impl FFISlice {
fn as_slice(&self) -> &[u8] {
unsafe { core::slice::from_raw_parts(self.ptr, self.len) }
}
fn as_str(&self) -> &str {
unsafe { core::str::from_utf8_unchecked(self.as_slice()) }
}
}
use util::FFISlice;
unsafe extern "C" {
unsafe fn int_to_str2(value: isize, buffer: *mut u8, buffer_len: usize, radix: u8) -> FFISlice;

104
lang/tests/shared/defs.rs Normal file
View file

@ -0,0 +1,104 @@
#![allow(non_camel_case_types, dead_code, non_upper_case_globals, improper_ctypes)]
// Auto-generated Rust bindings from assembly source
// Parser entry points implemented in src/ast.asm. Signatures must match the
// assembly calling convention exactly; do not alter them here.
unsafe extern "C" {
    pub unsafe fn parse_func(ast: *mut Ast) -> u64;
    // NOTE(review): a tuple return is not a stable C ABI (improper_ctypes is
    // allowed file-wide); this works only because the asm side matches
    // rustc's two-register layout — confirm against src/ast.asm.
    pub unsafe fn parse_args(ast: *mut Ast) -> (*const Argument, usize);
    pub unsafe fn parse_primary_expr(ast: *mut Ast) -> u64;
    pub unsafe fn parse_binary_expr(ast: *mut Ast, precedence: u8) -> u64;
    pub unsafe fn parse_expr(ast: *mut Ast) -> u64;
    pub unsafe fn parse_statement(ast: *mut Ast) -> u64;
    pub unsafe fn parse_block(ast: *mut Ast) -> u64;
    pub unsafe fn parse_type(ast: *mut Ast) -> Type;
}
// AST node kinds — mirror the AST_* `equ` constants in src/ast.asm.
pub const AST_FUNCTION: u32 = 1;
pub const AST_BLOCK: u32 = 2;
pub const AST_VARIABLE: u32 = 3;
pub const AST_NUMBER: u32 = 4;
pub const AST_BINARY_OP: u32 = 5;
pub const AST_RETURN_STATEMENT: u32 = 6;
// Type kinds — mirror the TYPE_* constants on the assembly side.
pub const TYPE_VOID: u32 = 1;
pub const TYPE_BOOL: u32 = 2;
pub const TYPE_I32: u32 = 3;
pub const TYPE_U32: u32 = 4;
pub const TYPE_STR: u32 = 5;
// Token kinds produced by the tokeniser (src/tokeniser.asm).
pub const TOKEN_EOF: u32 = 0;
pub const TOKEN_LET: u32 = 1;
pub const TOKEN_IF: u32 = 2;
pub const TOKEN_ELSE: u32 = 3;
pub const TOKEN_FN: u32 = 4;
pub const TOKEN_RETURN: u32 = 5;
pub const TOKEN_LOOP: u32 = 6;
pub const TOKEN_BREAK: u32 = 7;
pub const TOKEN_CONTINUE: u32 = 8;
pub const TOKEN_TRUE: u32 = 9;
pub const TOKEN_FALSE: u32 = 10;
pub const TOKEN_BOOL: u32 = 11;
pub const TOKEN_ARROW: u32 = 12;
pub const TOKEN_I32: u32 = 13;
pub const TOKEN_U32: u32 = 14;
pub const TOKEN_EQUALS: u32 = 15;
pub const TOKEN_PLUS: u32 = 16;
pub const TOKEN_MINUS: u32 = 17;
pub const TOKEN_RPARENS: u32 = 18;
pub const TOKEN_LPARENS: u32 = 19;
pub const TOKEN_RBRACE: u32 = 20;
pub const TOKEN_LBRACE: u32 = 21;
pub const TOKEN_COLON: u32 = 22;
pub const TOKEN_SEMI: u32 = 23;
pub const TOKEN_COMMA: u32 = 24;
pub const TOKEN_PIPE: u32 = 25;
pub const TOKEN_AMP: u32 = 26;
pub const TOKEN_EQEQ: u32 = 27;
pub const TOKEN_LBRACKET: u32 = 28;
pub const TOKEN_RBRACKET: u32 = 29;
pub const TOKEN_VOID: u32 = 30;
pub const TOKEN_IDENT: u32 = 31;
pub const TOKEN_NUMBER: u32 = 32;
pub const TOKEN_STRING: u32 = 33;
pub const TOKEN_COMMENT: u32 = 34;
/// Flat arena of AST nodes; a node's index in `nodes` acts as its id.
#[repr(C)]
#[derive(Debug)]
pub struct Ast {
    pub nodes: Vec<AstNode>,
}
/// One parsed node: a kind tag plus a kind-dependent payload.
#[repr(C)]
#[derive(Debug)]
pub struct AstNode {
    // One of the AST_* constants above.
    pub kind: u8,
    // For AST_NUMBER the value is stored inline in this pointer-sized field;
    // for AST_BINARY_OP it points at a `BinaryExpr` (see tests/ast.rs).
    pub data: *const (),
}
/// A function parameter: an unowned (ptr, len) name plus its type.
#[repr(C)]
#[derive(Debug)]
pub struct Argument {
    pub name: *const u8,
    pub name_len: usize,
    pub arg_type: Type,
}
/// A language type, tagged with one of the TYPE_* constants above.
#[repr(C)]
#[derive(Debug)]
pub struct Type {
    pub kind: u8,
}
/// Binary expression payload; `left`/`right` are presumably node ids into
/// `Ast::nodes` — TODO confirm against src/ast.asm.
#[repr(C)]
#[derive(Debug)]
pub struct BinaryExpr {
    pub left: u64,
    pub operator: u8,
    pub right: u64,
}
/// Block payload: an unowned array of statement node ids.
#[repr(C)]
#[derive(Debug)]
pub struct Block {
    pub statements: *const u64,
    pub statements_len: usize,
}
use super::vec::Vec;

355
lang/tests/shared/shared.rs Normal file
View file

@ -0,0 +1,355 @@
#![allow(dead_code)]
#[path = "defs.rs"]
pub mod defs;
// Cold landing pad kept out-of-line so the exported `panic` symbol stays tiny.
#[inline(never)]
fn __do_panic() -> ! {
    panic!("Called panic from external code.");
}
/// C-ABI `panic` entry point the assembly runtime jumps to on fatal errors.
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
    __do_panic()
}
/// Raw (ptr, len) pair returned across the assembly FFI boundary.
/// repr(C): field order/layout must match the asm side — do not reorder.
#[repr(C)]
#[derive(Debug, PartialEq, Eq)]
pub struct FFISlice {
    pub ptr: *const u8,
    // Element count as used by `as_slice`/`as_bytes` — for `u8` data this is
    // the byte length.
    pub len: usize,
}
/// Nullable slice for fallible FFI calls (e.g. `expect_token`); "none" is
/// encoded as a null `ptr`. repr(transparent): ABI-identical to `FFISlice`.
#[repr(transparent)]
#[derive(Debug, PartialEq, Eq)]
pub struct MaybeFFISlice {
    inner: FFISlice,
}
impl MaybeFFISlice {
    /// `None` is encoded on the FFI boundary as a null data pointer.
    pub fn is_none(&self) -> bool {
        self.inner.ptr.is_null()
    }

    /// Decodes the nullable FFI slice into a proper `Option`.
    pub fn into_option(self) -> Option<FFISlice> {
        match self.is_none() {
            true => None,
            false => Some(self.inner),
        }
    }
}
impl FFISlice {
    /// Reinterprets the raw parts as a typed slice.
    /// NOTE(review): `len` is treated as the *element* count for any `T`;
    /// callers must ensure it matches the pointee type — TODO confirm.
    pub unsafe fn as_slice<T: Sized>(&self) -> &[T] {
        unsafe { core::slice::from_raw_parts(self.ptr.cast(), self.len) }
    }

    /// Byte view; just the `u8` specialisation of `as_slice`.
    pub unsafe fn as_bytes(&self) -> &[u8] {
        unsafe { self.as_slice::<u8>() }
    }

    /// String view; skips UTF-8 validation, so the bytes must be valid UTF-8.
    pub unsafe fn as_str(&self) -> &str {
        unsafe { core::str::from_utf8_unchecked(self.as_bytes()) }
    }
}
/// Untyped growable buffer managed by the routines in src/vec.asm.
/// repr(C): layout is shared with the assembly side — do not reorder fields.
#[repr(C)]
#[derive(Debug)]
pub struct BlobVec {
    pub data: *mut u8,
    // Current element count.
    pub len: usize,
    // Allocated capacity, in elements.
    pub cap: usize,
    // Size of one element in bytes.
    pub elem_size: usize,
    // Optional destructor the asm side invokes for discarded elements.
    pub drop: Option<extern "C" fn(*mut u8)>,
}
impl Default for BlobVec {
fn default() -> Self {
Self {
data: core::ptr::null_mut(),
len: 0,
cap: 0,
elem_size: 0,
drop: None,
}
}
}
// SAFETY(review): assumes the assembly-side vec routines keep no
// thread-affine state behind `data` — TODO confirm against src/vec.asm.
unsafe impl Send for BlobVec {}
unsafe impl Sync for BlobVec {}
pub mod vec {
    //! Typed, drop-aware wrapper around the assembly-backed `BlobVec`.
    #![allow(dead_code)]
    use super::ffi::*;
    use super::*;
    /// Strongly-typed handle over a raw `BlobVec`.
    /// repr(transparent): ABI-identical to `BlobVec`, so it can be handed
    /// straight to the assembly routines.
    #[repr(transparent)]
    #[derive(Debug)]
    pub struct Vec<T> {
        pub vec: BlobVec,
        // Marks logical ownership of `T` values for drop/variance purposes.
        _marker: core::marker::PhantomData<T>,
    }
    impl<T> Vec<T> {
        /// New vec with a default capacity of 32 elements.
        pub fn new() -> Self {
            Self::new_with(32)
        }
        /// New vec with the given capacity; registers a C-ABI shim so the
        /// assembly side can run `T`'s destructor on discarded elements.
        pub fn new_with(capacity: usize) -> Self {
            let mut vec = BlobVec {
                data: core::ptr::null_mut(),
                len: 0,
                cap: 0,
                elem_size: 0,
                drop: None,
            };
            // Trampoline: C ABI on the outside, `drop_in_place::<T>` inside.
            extern "C" fn drop_fn<T>(ptr: *mut u8) {
                unsafe {
                    core::ptr::drop_in_place::<T>(ptr as *mut T);
                }
            }
            unsafe {
                vec_init_with(
                    &mut vec,
                    core::mem::size_of::<T>(),
                    Some(drop_fn::<T>),
                    capacity,
                );
            }
            Self {
                vec,
                _marker: core::marker::PhantomData,
            }
        }
        /// Borrow the elements as a slice.
        pub fn as_slice(&self) -> &[T] {
            // Guard against viewing a vec initialised for a different type.
            assert_eq!(self.vec.elem_size, core::mem::size_of::<T>());
            unsafe { core::slice::from_raw_parts(self.vec.data as *const T, self.vec.len) }
        }
        /// Mutably borrow the elements as a slice.
        pub fn as_slice_mut(&mut self) -> &mut [T] {
            assert_eq!(self.vec.elem_size, core::mem::size_of::<T>());
            unsafe { core::slice::from_raw_parts_mut(self.vec.data as *mut T, self.vec.len) }
        }
        /// Append `value`; ownership transfers to the vec, so drop is
        /// suppressed here via ManuallyDrop (the asm side copies the bytes).
        pub fn push(&mut self, value: T) {
            let value = core::mem::ManuallyDrop::new(value);
            unsafe {
                vec_push(&mut self.vec, &raw const value as *const T as *const u8);
            }
        }
        /// Insert `value` at `index`; silently no-ops when out of range.
        pub fn insert(&mut self, value: T, index: usize) {
            if index > self.vec.len {
                return;
            }
            let value = core::mem::ManuallyDrop::new(value);
            unsafe {
                vec_insert(
                    &mut self.vec,
                    index,
                    &raw const value as *const T as *const u8,
                );
            }
        }
        /// Remove and return the last element, if any.
        pub fn pop(&mut self) -> Option<T> {
            if self.vec.len == 0 {
                return None;
            }
            unsafe {
                // Move the value out first, then shrink.
                // NOTE(review): assumes `vec_pop` only decrements `len` and
                // does NOT invoke the drop hook (that would double-drop the
                // value just read) — confirm against src/vec.asm.
                let ptr = vec_get(&mut self.vec, self.vec.len - 1) as *mut T;
                let value = ptr.read();
                vec_pop(&mut self.vec);
                Some(value)
            }
        }
        /// Checked element access.
        pub fn get(&self, index: usize) -> Option<&T> {
            if index >= self.vec.len {
                return None;
            }
            unsafe {
                // const→mut cast only to satisfy the FFI signature; the
                // returned reference is still handed out as shared.
                let ptr = vec_get(&raw const self.vec as *mut _, index) as *mut T;
                Some(&*ptr)
            }
        }
        /// Checked mutable element access.
        pub fn get_mut(&mut self, index: usize) -> Option<&mut T> {
            if index >= self.vec.len {
                return None;
            }
            unsafe {
                let ptr = vec_get(&raw mut self.vec, index) as *mut T;
                Some(&mut *ptr)
            }
        }
        /// Remove the element at `index`; silently no-ops when out of range.
        pub fn remove(&mut self, index: usize) {
            if index >= self.vec.len {
                return;
            }
            unsafe {
                vec_remove(&mut self.vec, index);
            }
        }
        /// Number of elements currently stored.
        pub fn len(&self) -> usize {
            self.vec.len
        }
        /// Linear search using `cmp`; None when no element matches.
        pub fn position<F>(&self, elem: &T, mut cmp: F) -> Option<usize>
        where
            F: FnMut(&T, &T) -> bool,
        {
            // C-ABI trampoline that forwards to the captured Rust closure
            // passed through the opaque `f` pointer.
            extern "C" fn cmp_trampoline<T, F: FnMut(&T, &T) -> bool>(
                f: *const (),
                a: *const u8,
                b: *const u8,
            ) -> bool {
                let f = unsafe { &mut *(f as *mut F) };
                let a = unsafe { &*(a as *const T) };
                let b = unsafe { &*(b as *const T) };
                f(a, b)
            }
            unsafe {
                let index = vec_find(
                    &raw const self.vec as *mut _,
                    elem as *const T as *const u8,
                    cmp_trampoline::<T, F>,
                    &raw mut cmp as *mut F as *mut (),
                );
                // The asm side signals "not found" with usize::MAX.
                if index == usize::MAX {
                    None
                } else {
                    Some(index)
                }
            }
        }
        /// Binary search with a three-way comparator; Ok(found index) or
        /// Err(insertion point), mirroring std's `binary_search_by`.
        pub fn binary_search_by<F>(&self, elem: &T, mut cmp: F) -> Result<usize, usize>
        where
            F: FnMut(&T, &T) -> i32,
        {
            extern "C" fn cmp_trampoline<T, F: FnMut(&T, &T) -> i32>(
                f: *const (),
                a: *const u8,
                b: *const u8,
            ) -> i32 {
                let f = unsafe { &mut *(f as *mut F) };
                let a = unsafe { &*(a as *const T) };
                let b = unsafe { &*(b as *const T) };
                f(a, b)
            }
            unsafe {
                let (index, vacant) = vec_binary_search_by(
                    &raw const self.vec as *mut _,
                    elem as *const T as *const u8,
                    cmp_trampoline::<T, F>,
                    &raw mut cmp as *mut F as *mut (),
                );
                if vacant {
                    Err(index)
                } else {
                    Ok(index)
                }
            }
        }
        /// Insert `elem` keeping the vec sorted under `cmp`.
        /// NOTE(review): takes `&self` but mutates through a const→mut cast;
        /// soundness rests on the asm side — consider `&mut self` in a
        /// follow-up (callers would need updating).
        pub fn insert_sorted<F>(&self, elem: T, mut cmp: F) -> Result<usize, usize>
        where
            F: FnMut(&T, &T) -> i32,
        {
            extern "C" fn cmp_trampoline<T, F: FnMut(&T, &T) -> i32>(
                f: *const (),
                a: *const u8,
                b: *const u8,
            ) -> i32 {
                let f = unsafe { &mut *(f as *mut F) };
                let a = unsafe { &*(a as *const T) };
                let b = unsafe { &*(b as *const T) };
                f(a, b)
            }
            // Ownership moves to the vec; suppress the local drop.
            let mut elem = core::mem::ManuallyDrop::new(elem);
            unsafe {
                let (index, _inserted) = vec_insert_sorted(
                    &raw const self.vec as *mut _,
                    &raw mut elem as *const u8,
                    cmp_trampoline::<T, F>,
                    &raw mut cmp as *mut F as *mut (),
                );
                Ok(index)
            }
        }
    }
}
pub mod ffi {
    //! Raw declarations for the vector routines implemented in src/vec.asm.
    //! Signatures must stay in sync with the assembly; do not alter them.
    // Tuple returns and `*mut ()` are not C-ABI-stable types; this relies on
    // the asm side matching rustc's layout.
    #![allow(improper_ctypes)]
    use super::*;
    #[allow(dead_code)]
    unsafe extern "C" {
        /// Initialise `vec` for `elem_size`-byte elements with default capacity.
        pub unsafe fn vec_init(
            vec: *mut BlobVec,
            elem_size: usize,
            drop: Option<extern "C" fn(*mut u8)>,
        );
        /// Like `vec_init`, but pre-allocates room for `cap` elements.
        pub unsafe fn vec_init_with(
            vec: *mut BlobVec,
            elem_size: usize,
            drop: Option<extern "C" fn(*mut u8)>,
            cap: usize,
        );
        /// Append one element, copied from `elem`.
        pub unsafe fn vec_push(vec: *mut BlobVec, elem: *const u8);
        /// Insert one element at `index`, copied from `elem`.
        pub unsafe fn vec_insert(vec: *mut BlobVec, index: usize, elem: *const u8);
        /// Remove the last element.
        pub unsafe fn vec_pop(vec: *mut BlobVec);
        /// Remove the last element, running the drop hook on it.
        pub unsafe fn vec_drop_last(vec: *mut BlobVec);
        /// Pointer to the element at `index`.
        pub unsafe fn vec_get(vec: *mut BlobVec, index: usize) -> *mut u8;
        /// Remove the element at `index`.
        pub unsafe fn vec_remove(vec: *mut BlobVec, index: usize);
        /// Free the vec's storage, dropping all elements.
        pub unsafe fn vec_drop(vec: *mut BlobVec);
        /// Linear search via `cmp(cmp_data, a, b)`; returns usize::MAX if absent.
        pub unsafe fn vec_find(
            vec: *mut BlobVec,
            elem: *const u8,
            cmp: extern "C" fn(*const (), *const u8, *const u8) -> bool,
            cmp_data: *mut (),
        ) -> usize;
        /// Binary search; returns (index, vacant) where vacant means "not found,
        /// index is the insertion point".
        pub unsafe fn vec_binary_search_by(
            vec: *mut BlobVec,
            elem: *const u8,
            cmp: extern "C" fn(*const (), *const u8, *const u8) -> i32,
            cmp_data: *mut (),
        ) -> (usize, bool);
        /// Sorted insert; returns (index, inserted).
        pub unsafe fn vec_insert_sorted(
            vec: *mut BlobVec,
            elem: *const u8,
            cmp: extern "C" fn(*const (), *const u8, *const u8) -> i32,
            cmp_data: *mut (),
        ) -> (usize, bool);
    }
}
/// Adapter that lets any `&[T]` of `Display` items print as `[a, b, c]`.
pub struct DisplaySlice<'a, T>(pub &'a [T]);

impl<'a, T: core::fmt::Display> core::fmt::Display for DisplaySlice<'a, T> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str("[")?;
        // Empty separator before the first item, ", " before every later one.
        let mut sep = "";
        for item in self.0 {
            write!(f, "{sep}{item}")?;
            sep = ", ";
        }
        f.write_str("]")
    }
}

View file

@ -1,7 +1,7 @@
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
#[path = "shared/shared.rs"]
mod util;
use util::*;
#[derive(Debug)]
struct Lexeme(u8, &'static str);
@ -18,12 +18,6 @@ impl PartialEq for Lexeme {
impl Eq for Lexeme {}
impl Lexeme {
fn lex(&self) -> &'static str {
self.1
}
}
trait AsLexeme {
fn as_lexeme(self) -> Option<Lexeme>;
}
@ -49,12 +43,15 @@ impl AsLexeme for LexemeRaw {
#[allow(dead_code)]
unsafe extern "C" {
unsafe fn tokeniser_init(path: *const i8) -> ();
unsafe fn tokeniser_init_buf(bytes: *const u8, len: usize) -> ();
unsafe fn tokeniser_print() -> ();
unsafe fn is_ident(len: usize) -> bool;
unsafe fn is_number(len: usize) -> bool;
unsafe fn skip_whitespace() -> ();
unsafe fn find_lexeme() -> LexemeRaw;
unsafe fn expect_token(token: u8) -> MaybeFFISlice;
unsafe fn unwrap_token(token: u8) -> FFISlice;
static mut LEXEMES: *const u8;
static mut LEXEME_LENS: usize;
@ -137,16 +134,16 @@ fn main() {
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(30, "this-is-an-ident"),
Lexeme(30, "another_ident123"),
Lexeme(30, "_underscore_test"),
Lexeme(30, "mixedCASEIdent"),
Lexeme(30, "number12345"),
Lexeme(30, "____"),
Lexeme(30, "_"),
Lexeme(31, "this-is-an-ident"),
Lexeme(31, "another_ident123"),
Lexeme(31, "_underscore_test"),
Lexeme(31, "mixedCASEIdent"),
Lexeme(31, "number12345"),
Lexeme(31, "____"),
Lexeme(31, "_"),
Lexeme(17, ""),
Lexeme(30, "leading-minus"),
Lexeme(30, "trailing-minus-"),
Lexeme(31, "leading-minus"),
Lexeme(31, "trailing-minus-"),
]
);
@ -158,7 +155,7 @@ fn main() {
&collect_tokens()[..],
&[
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(31, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
@ -171,6 +168,14 @@ fn main() {
]
);
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/function.l".as_ptr());
eprintln!("ok.");
assert_eq!(expect_token(2).into_option(), None);
assert_eq!(expect_token(4).into_option().unwrap().as_str(), "fn");
assert_eq!(unwrap_token(31).as_str(), "my-function");
eprint!("Initializing tokeniser.. ");
tokeniser_init(c"tests/tokens/comment.l".as_ptr());
eprintln!("ok.");
@ -178,15 +183,15 @@ fn main() {
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(33, ""),
Lexeme(34, ""),
Lexeme(4, ""),
Lexeme(30, "my-function"),
Lexeme(31, "my-function"),
Lexeme(19, ""),
Lexeme(18, ""),
Lexeme(12, ""),
Lexeme(11, ""),
Lexeme(21, ""),
Lexeme(33, ""),
Lexeme(34, ""),
Lexeme(5, ""),
Lexeme(10, ""),
Lexeme(23, ""),
@ -201,11 +206,11 @@ fn main() {
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(31, "1234"),
Lexeme(31, "123_345_"),
Lexeme(31, "1234____56"),
Lexeme(31, "1"),
Lexeme(31, "0"),
Lexeme(32, "1234"),
Lexeme(32, "123_345_"),
Lexeme(32, "1234____56"),
Lexeme(32, "1"),
Lexeme(32, "0"),
]
);
@ -216,14 +221,24 @@ fn main() {
assert_eq!(
&collect_tokens()[..],
&[
Lexeme(32, "\"this is a string\""),
Lexeme(32, "\"another\nstring\nspanning multiple\n lines\""),
Lexeme(32, "\"string with a \\\"quoted\\\" word\""),
Lexeme(32, "\"a\""),
Lexeme(32, "\"\"")
Lexeme(33, "\"this is a string\""),
Lexeme(33, "\"another\nstring\nspanning multiple\n lines\""),
Lexeme(33, "\"string with a \\\"quoted\\\" word\""),
Lexeme(33, "\"a\""),
Lexeme(33, "\"\"")
],
);
eprint!("Initializing tokeniser.. ");
let src = b"3 + 4";
tokeniser_init_buf(src.as_ptr(), src.len());
eprintln!("ok.");
assert_eq!(
&collect_tokens()[..],
&[Lexeme(32, "3"), Lexeme(16, "+"), Lexeme(32, "4")],
);
eprintln!("Finished tokenising.");
}
}

View file

@ -1,263 +1,7 @@
#[repr(C)]
pub struct BlobVec {
pub data: *mut u8,
pub len: usize,
pub cap: usize,
pub elem_size: usize,
pub drop: Option<extern "C" fn(*mut u8)>,
}
#[path = "shared/shared.rs"]
mod util;
struct VecT<T> {
vec: BlobVec,
_marker: core::marker::PhantomData<T>,
}
impl<T> VecT<T> {
fn new() -> Self {
Self::new_with(32)
}
fn new_with(capacity: usize) -> Self {
let mut vec = BlobVec {
data: core::ptr::null_mut(),
len: 0,
cap: 0,
elem_size: 0,
drop: None,
};
extern "C" fn drop_fn<T>(ptr: *mut u8) {
unsafe {
core::ptr::drop_in_place::<T>(ptr as *mut T);
}
}
unsafe {
vec_init_with(
&mut vec,
core::mem::size_of::<T>(),
Some(drop_fn::<T>),
capacity,
);
}
Self {
vec,
_marker: core::marker::PhantomData,
}
}
fn as_slice(&self) -> &[T] {
assert_eq!(self.vec.elem_size, core::mem::size_of::<T>());
unsafe { core::slice::from_raw_parts(self.vec.data as *const T, self.vec.len) }
}
fn as_slice_mut(&mut self) -> &mut [T] {
assert_eq!(self.vec.elem_size, core::mem::size_of::<T>());
unsafe { core::slice::from_raw_parts_mut(self.vec.data as *mut T, self.vec.len) }
}
fn push(&mut self, value: T) {
let value = core::mem::ManuallyDrop::new(value);
unsafe {
vec_push(&mut self.vec, &raw const value as *const T as *const u8);
}
}
fn insert(&mut self, value: T, index: usize) {
if index > self.vec.len {
return;
}
let value = core::mem::ManuallyDrop::new(value);
unsafe {
vec_insert(
&mut self.vec,
index,
&raw const value as *const T as *const u8,
);
}
}
fn pop(&mut self) -> Option<T> {
if self.vec.len == 0 {
return None;
}
unsafe {
let ptr = vec_get(&mut self.vec, self.vec.len - 1) as *mut T;
let value = ptr.read();
vec_pop(&mut self.vec);
Some(value)
}
}
fn get(&self, index: usize) -> Option<&T> {
if index >= self.vec.len {
return None;
}
unsafe {
let ptr = vec_get(&raw const self.vec as *mut _, index) as *mut T;
Some(&*ptr)
}
}
fn get_mut(&mut self, index: usize) -> Option<&mut T> {
if index >= self.vec.len {
return None;
}
unsafe {
let ptr = vec_get(&raw mut self.vec, index) as *mut T;
Some(&mut *ptr)
}
}
fn remove(&mut self, index: usize) {
if index >= self.vec.len {
return;
}
unsafe {
vec_remove(&mut self.vec, index);
}
}
fn len(&self) -> usize {
self.vec.len
}
fn position<F>(&self, elem: &T, mut cmp: F) -> Option<usize>
where
F: FnMut(&T, &T) -> bool,
{
extern "C" fn cmp_trampoline<T, F: FnMut(&T, &T) -> bool>(
f: *const (),
a: *const u8,
b: *const u8,
) -> bool {
let f = unsafe { &mut *(f as *mut F) };
let a = unsafe { &*(a as *const T) };
let b = unsafe { &*(b as *const T) };
f(a, b)
}
unsafe {
let index = vec_find(
&raw const self.vec as *mut _,
elem as *const T as *const u8,
cmp_trampoline::<T, F>,
&raw mut cmp as *mut F as *mut (),
);
if index == usize::MAX {
None
} else {
Some(index)
}
}
}
fn binary_search_by<F>(&self, elem: &T, mut cmp: F) -> Result<usize, usize>
where
F: FnMut(&T, &T) -> i32,
{
extern "C" fn cmp_trampoline<T, F: FnMut(&T, &T) -> i32>(
f: *const (),
a: *const u8,
b: *const u8,
) -> i32 {
let f = unsafe { &mut *(f as *mut F) };
let a = unsafe { &*(a as *const T) };
let b = unsafe { &*(b as *const T) };
f(a, b)
}
unsafe {
let (index, vacant) = vec_binary_search_by(
&raw const self.vec as *mut _,
elem as *const T as *const u8,
cmp_trampoline::<T, F>,
&raw mut cmp as *mut F as *mut (),
);
if vacant {
Err(index)
} else {
Ok(index)
}
}
}
fn insert_sorted<F>(&self, elem: T, mut cmp: F) -> Result<usize, usize>
where
F: FnMut(&T, &T) -> i32,
{
extern "C" fn cmp_trampoline<T, F: FnMut(&T, &T) -> i32>(
f: *const (),
a: *const u8,
b: *const u8,
) -> i32 {
let f = unsafe { &mut *(f as *mut F) };
let a = unsafe { &*(a as *const T) };
let b = unsafe { &*(b as *const T) };
f(a, b)
}
let mut elem = core::mem::ManuallyDrop::new(elem);
unsafe {
let (index, inserted) = vec_insert_sorted(
&raw const self.vec as *mut _,
&raw const elem as *const u8,
cmp_trampoline::<T, F>,
&raw mut cmp as *mut F as *mut (),
);
Ok(index)
}
}
}
#[unsafe(no_mangle)]
extern "C" fn panic() -> ! {
panic!("Called panic from external code.");
}
unsafe impl Send for BlobVec {}
unsafe impl Sync for BlobVec {}
unsafe extern "C" {
unsafe fn vec_init(vec: *mut BlobVec, elem_size: usize, drop: Option<extern "C" fn(*mut u8)>);
unsafe fn vec_init_with(
vec: *mut BlobVec,
elem_size: usize,
drop: Option<extern "C" fn(*mut u8)>,
cap: usize,
);
unsafe fn vec_push(vec: *mut BlobVec, elem: *const u8);
unsafe fn vec_insert(vec: *mut BlobVec, index: usize, elem: *const u8);
unsafe fn vec_pop(vec: *mut BlobVec);
unsafe fn vec_drop_last(vec: *mut BlobVec);
unsafe fn vec_get(vec: *mut BlobVec, index: usize) -> *mut u8;
#[allow(dead_code)]
unsafe fn vec_remove(vec: *mut BlobVec, index: usize);
#[allow(dead_code)]
unsafe fn vec_drop(vec: *mut BlobVec);
unsafe fn vec_find(
vec: *mut BlobVec,
elem: *const u8,
cmp: extern "C" fn(*const (), *const u8, *const u8) -> bool,
cmp_data: *mut (),
) -> usize;
unsafe fn vec_binary_search_by(
vec: *mut BlobVec,
elem: *const u8,
cmp: extern "C" fn(*const (), *const u8, *const u8) -> i32,
cmp_data: *mut (),
) -> (usize, bool);
unsafe fn vec_insert_sorted(
vec: *mut BlobVec,
elem: *const u8,
cmp: extern "C" fn(*const (), *const u8, *const u8) -> i32,
cmp_data: *mut (),
) -> (usize, bool);
}
use util::{ffi::*, vec::Vec, BlobVec};
fn main() {
static mut DROPS: usize = 1;
@ -318,7 +62,7 @@ fn main() {
eprintln!("Push/pop test passed\n");
}
let mut vec = VecT::<u32>::new_with(100);
let mut vec = Vec::<u32>::new_with(100);
assert_eq!(vec.len(), 0);
vec.push(10);
vec.push(20);
@ -358,6 +102,6 @@ fn main() {
assert_eq!(vec.binary_search_by(&5, cmp), Err(0));
assert_eq!(vec.binary_search_by(&55, cmp), Err(4));
vec.insert_sorted(35, cmp);
_ = vec.insert_sorted(35, cmp);
assert_eq!(vec.as_slice(), &[20, 30, 35, 40, 50]);
}