update python script, add slash and star tokens

janis 2025-10-30 00:11:46 +01:00
parent 886525cf7e
commit 838c96f04f
Signed by: janis
SSH key fingerprint: SHA256:bB1qbbqmDXZNT0KKD5c2Dfjg53JGhj7B3CFcLIzSqq8
4 changed files with 133 additions and 104 deletions


@@ -32,6 +32,8 @@ LEXEMES:
 dq LEX_LBRACKET
 dq LEX_RBRACKET
 dq LEX_VOID
+dq LEX_SLASH
+dq LEX_STAR
 align 8
 TOKENS:
@@ -66,6 +68,8 @@ TOKENS:
 db TOKEN_LBRACKET ;; 28
 db TOKEN_RBRACKET ;; 29
 db TOKEN_VOID ;; 30
+db TOKEN_SLASH ;; 31
+db TOKEN_STAR ;; 32
 align 8
 LEXEME_LENS:
@@ -100,9 +104,11 @@ LEXEME_LENS:
 dq LEX_LBRACKET_len
 dq LEX_RBRACKET_len
 dq LEX_VOID_len
+dq LEX_SLASH_len
+dq LEX_STAR_len
 align 8
-NUM_LEXEMES: dq 31
+NUM_LEXEMES: dq 33
 LEX_NOT_A_LEXEME db "<not a lexeme>", 0
 LEX_LET db "let"
@@ -165,6 +171,10 @@ NUM_LEXEMES: dq 31
 LEX_RBRACKET_len equ $ - LEX_RBRACKET
 LEX_VOID db "void"
 LEX_VOID_len equ $ - LEX_VOID
+LEX_SLASH db "/"
+LEX_SLASH_len equ $ - LEX_SLASH
+LEX_STAR db "*"
+LEX_STAR_len equ $ - LEX_STAR
 LEX_IDENT db "<identifier>"
 LEX_IDENT_len equ $ - LEX_IDENT
 LEX_NUMBER db "<number>"
@@ -175,39 +185,41 @@ NUM_LEXEMES: dq 31
 LEX_COMMENT_len equ $ - LEX_COMMENT
 ;; start-consts
-TOKEN_EOF equ 0
-TOKEN_LET equ 1
-TOKEN_IF equ 2
-TOKEN_ELSE equ 3
-TOKEN_FN equ 4
-TOKEN_RETURN equ 5
-TOKEN_LOOP equ 6
-TOKEN_BREAK equ 7
-TOKEN_CONTINUE equ 8
-TOKEN_TRUE equ 9
-TOKEN_FALSE equ 10
-TOKEN_BOOL equ 11
-TOKEN_ARROW equ 12
-TOKEN_I32 equ 13
-TOKEN_U32 equ 14
-TOKEN_EQUALS equ 15
-TOKEN_PLUS equ 16
-TOKEN_MINUS equ 17
-TOKEN_RPARENS equ 18
-TOKEN_LPARENS equ 19
-TOKEN_RBRACE equ 20
-TOKEN_LBRACE equ 21
-TOKEN_COLON equ 22
-TOKEN_SEMI equ 23
-TOKEN_COMMA equ 24
-TOKEN_PIPE equ 25
-TOKEN_AMP equ 26
-TOKEN_EQEQ equ 27
-TOKEN_LBRACKET equ 28
-TOKEN_RBRACKET equ 29
-TOKEN_VOID equ 30
-TOKEN_IDENT equ 31
-TOKEN_NUMBER equ 32
-TOKEN_STRING equ 33
-TOKEN_COMMENT equ 34
+TOKEN_EOF equ 0 ; :u8
+TOKEN_LET equ 1 ; :u8
+TOKEN_IF equ 2 ; :u8
+TOKEN_ELSE equ 3 ; :u8
+TOKEN_FN equ 4 ; :u8
+TOKEN_RETURN equ 5 ; :u8
+TOKEN_LOOP equ 6 ; :u8
+TOKEN_BREAK equ 7 ; :u8
+TOKEN_CONTINUE equ 8 ; :u8
+TOKEN_TRUE equ 9 ; :u8
+TOKEN_FALSE equ 10 ; :u8
+TOKEN_BOOL equ 11 ; :u8
+TOKEN_ARROW equ 12 ; :u8
+TOKEN_I32 equ 13 ; :u8
+TOKEN_U32 equ 14 ; :u8
+TOKEN_EQUALS equ 15 ; :u8
+TOKEN_PLUS equ 16 ; :u8
+TOKEN_MINUS equ 17 ; :u8
+TOKEN_RPARENS equ 18 ; :u8
+TOKEN_LPARENS equ 19 ; :u8
+TOKEN_RBRACE equ 20 ; :u8
+TOKEN_LBRACE equ 21 ; :u8
+TOKEN_COLON equ 22 ; :u8
+TOKEN_SEMI equ 23 ; :u8
+TOKEN_COMMA equ 24 ; :u8
+TOKEN_PIPE equ 25 ; :u8
+TOKEN_AMP equ 26 ; :u8
+TOKEN_EQEQ equ 27 ; :u8
+TOKEN_LBRACKET equ 28 ; :u8
+TOKEN_RBRACKET equ 29 ; :u8
+TOKEN_VOID equ 30 ; :u8
+TOKEN_SLASH equ 31 ; :u8
+TOKEN_STAR equ 32 ; :u8
+TOKEN_IDENT equ 33 ; :u8
+TOKEN_NUMBER equ 34 ; :u8
+TOKEN_STRING equ 35 ; :u8
+TOKEN_COMMENT equ 36 ; :u8
 ;; end-consts


@@ -44,6 +44,7 @@ DEFINE_FN_RE = re.compile(r'^\s*;+\s*define-fn:\s*(.+)$', re.IGNORECASE)
 CONST_EQU_RE = re.compile(r'^\s*([A-Za-z_]\w*)\s+equ\s+(.+)$', re.IGNORECASE)
 STRUCT_START_RE = re.compile(r'^\s*struct\s+([A-Za-z_]\w*)\s*\{') # after comment markers stripped
 RUST_FENCE_RE = re.compile(r'^\s*```\s*(rust)?\s*$', re.IGNORECASE) # matches ``` or ```rust (after stripping leading comment)
+TYPE_ANNOT_RE = re.compile(r':\s*([A-Za-z0-9_\<\>\*\s\[\]\:&]+)') # matches :u8, : *const u8, Vec<T>, etc.
 def strip_leading_semicolons(line: str) -> str:
@@ -110,12 +111,12 @@ def parse_file(path: Path) -> Dict[str, Any]:
     """
     Parse a single assembly file and return dict with keys: 'functions', 'consts', 'structs', 'rust_blocks'
     - functions: list of signature strings (e.g. "parse_ast(data: *const u8) -> Ast")
-    - consts: list of (name, value)
+    - consts: list of (name, value, type)
     - structs: list of (name, field_lines)
     - rust_blocks: list of rust code blocks; each block is list[str] of code lines (no fences, uncommented)
     """
     functions: List[str] = []
-    consts: List[Tuple[str, str]] = []
+    consts: List[Tuple[str, str, str]] = []
     structs: List[Tuple[str, List[str]]] = []
     rust_blocks: List[List[str]] = []
@@ -158,8 +159,20 @@ def parse_file(path: Path) -> Dict[str, Any]:
             m = CONST_EQU_RE.match(s)
             if m:
                 name = m.group(1)
-                value = m.group(2).strip()
-                consts.append((name, value))
+                rest = m.group(2).strip()
+                # Defaults
+                value = rest
+                ctype = 'u32'
+                # If there's an inline comment (assembly comments start with ';'), split it off.
+                if ';' in rest:
+                    val_part, comment_part = rest.split(';', 1)
+                    value = val_part.strip()
+                    # Strip any leading semicolons left in comment_part (e.g. ";; :u8")
+                    comment = comment_part.lstrip(';').strip()
+                    mtype = TYPE_ANNOT_RE.search(comment)
+                    if mtype:
+                        ctype = mtype.group(1).strip()
+                consts.append((name, value, ctype))
                 in_consts = False
                 const_buffer = []
             i += 1
@@ -215,7 +228,7 @@ def parse_file(path: Path) -> Dict[str, Any]:
     }
-def render_rust(function_sigs: List[str], consts: List[Tuple[str, str]],
+def render_rust(function_sigs: List[str], consts: List[Tuple[str, str, str]],
                 structs: List[Tuple[str, List[str]]], rust_blocks: List[List[str]]) -> str:
     parts: List[str] = []
     parts.append('#![allow(non_camel_case_types, dead_code, non_upper_case_globals, improper_ctypes)]')
@@ -230,8 +243,8 @@ def render_rust(function_sigs: List[str], consts: List[Tuple[str, str]],
     parts.append('') # blank line
     # Consts
-    for name, value in consts:
-        parts.append(f'pub const {name}: u32 = {value};')
+    for name, value, ctype in consts:
+        parts.append(f'pub const {name}: {ctype} = {value};')
     if consts:
         parts.append('')
@@ -265,7 +278,7 @@ def main(argv=None):
     args = parser.parse_args(argv)
     combined_functions: List[str] = []
-    combined_consts: List[Tuple[str, str]] = []
+    combined_consts: List[Tuple[str, str, str]] = []
     combined_structs: List[Tuple[str, List[str]]] = []
     combined_rust_blocks: List[List[str]] = []
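To sanity-check the new type-annotation handling in isolation, here is a minimal standalone sketch. `parse_const` is a hypothetical helper written only for this example; the two regexes are copied verbatim from the script above, and the splitting logic mirrors the parse_file change.

import re

# Regexes copied from the script above.
CONST_EQU_RE = re.compile(r'^\s*([A-Za-z_]\w*)\s+equ\s+(.+)$', re.IGNORECASE)
TYPE_ANNOT_RE = re.compile(r':\s*([A-Za-z0-9_\<\>\*\s\[\]\:&]+)')

def parse_const(line):
    # Hypothetical helper, not part of the script: returns (name, value, ctype)
    # using the same comment-splitting logic as the parse_file change above.
    m = CONST_EQU_RE.match(line)
    if not m:
        return None
    name, rest = m.group(1), m.group(2).strip()
    value, ctype = rest, 'u32'  # ctype defaults to u32 when no annotation is present
    if ';' in rest:
        val_part, comment_part = rest.split(';', 1)
        value = val_part.strip()
        mtype = TYPE_ANNOT_RE.search(comment_part.lstrip(';').strip())
        if mtype:
            ctype = mtype.group(1).strip()
    return (name, value, ctype)

assert parse_const('TOKEN_SLASH equ 31 ; :u8') == ('TOKEN_SLASH', '31', 'u8')
assert parse_const('TYPE_STR equ 5') == ('TYPE_STR', '5', 'u32')  # no annotation, keeps u32

This is why, in the generated defs shown in the next file, the TOKEN_* constants switch from u32 to u8 while the TYPE_* constants keep the u32 default.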


@@ -25,41 +25,43 @@ pub const TYPE_BOOL: u32 = 2;
 pub const TYPE_I32: u32 = 3;
 pub const TYPE_U32: u32 = 4;
 pub const TYPE_STR: u32 = 5;
-pub const TOKEN_EOF: u32 = 0;
-pub const TOKEN_LET: u32 = 1;
-pub const TOKEN_IF: u32 = 2;
-pub const TOKEN_ELSE: u32 = 3;
-pub const TOKEN_FN: u32 = 4;
-pub const TOKEN_RETURN: u32 = 5;
-pub const TOKEN_LOOP: u32 = 6;
-pub const TOKEN_BREAK: u32 = 7;
-pub const TOKEN_CONTINUE: u32 = 8;
-pub const TOKEN_TRUE: u32 = 9;
-pub const TOKEN_FALSE: u32 = 10;
-pub const TOKEN_BOOL: u32 = 11;
-pub const TOKEN_ARROW: u32 = 12;
-pub const TOKEN_I32: u32 = 13;
-pub const TOKEN_U32: u32 = 14;
-pub const TOKEN_EQUALS: u32 = 15;
-pub const TOKEN_PLUS: u32 = 16;
-pub const TOKEN_MINUS: u32 = 17;
-pub const TOKEN_RPARENS: u32 = 18;
-pub const TOKEN_LPARENS: u32 = 19;
-pub const TOKEN_RBRACE: u32 = 20;
-pub const TOKEN_LBRACE: u32 = 21;
-pub const TOKEN_COLON: u32 = 22;
-pub const TOKEN_SEMI: u32 = 23;
-pub const TOKEN_COMMA: u32 = 24;
-pub const TOKEN_PIPE: u32 = 25;
-pub const TOKEN_AMP: u32 = 26;
-pub const TOKEN_EQEQ: u32 = 27;
-pub const TOKEN_LBRACKET: u32 = 28;
-pub const TOKEN_RBRACKET: u32 = 29;
-pub const TOKEN_VOID: u32 = 30;
-pub const TOKEN_IDENT: u32 = 31;
-pub const TOKEN_NUMBER: u32 = 32;
-pub const TOKEN_STRING: u32 = 33;
-pub const TOKEN_COMMENT: u32 = 34;
+pub const TOKEN_EOF: u8 = 0;
+pub const TOKEN_LET: u8 = 1;
+pub const TOKEN_IF: u8 = 2;
+pub const TOKEN_ELSE: u8 = 3;
+pub const TOKEN_FN: u8 = 4;
+pub const TOKEN_RETURN: u8 = 5;
+pub const TOKEN_LOOP: u8 = 6;
+pub const TOKEN_BREAK: u8 = 7;
+pub const TOKEN_CONTINUE: u8 = 8;
+pub const TOKEN_TRUE: u8 = 9;
+pub const TOKEN_FALSE: u8 = 10;
+pub const TOKEN_BOOL: u8 = 11;
+pub const TOKEN_ARROW: u8 = 12;
+pub const TOKEN_I32: u8 = 13;
+pub const TOKEN_U32: u8 = 14;
+pub const TOKEN_EQUALS: u8 = 15;
+pub const TOKEN_PLUS: u8 = 16;
+pub const TOKEN_MINUS: u8 = 17;
+pub const TOKEN_RPARENS: u8 = 18;
+pub const TOKEN_LPARENS: u8 = 19;
+pub const TOKEN_RBRACE: u8 = 20;
+pub const TOKEN_LBRACE: u8 = 21;
+pub const TOKEN_COLON: u8 = 22;
+pub const TOKEN_SEMI: u8 = 23;
+pub const TOKEN_COMMA: u8 = 24;
+pub const TOKEN_PIPE: u8 = 25;
+pub const TOKEN_AMP: u8 = 26;
+pub const TOKEN_EQEQ: u8 = 27;
+pub const TOKEN_LBRACKET: u8 = 28;
+pub const TOKEN_RBRACKET: u8 = 29;
+pub const TOKEN_VOID: u8 = 30;
+pub const TOKEN_SLASH: u8 = 31;
+pub const TOKEN_STAR: u8 = 32;
+pub const TOKEN_IDENT: u8 = 33;
+pub const TOKEN_NUMBER: u8 = 34;
+pub const TOKEN_STRING: u8 = 35;
+pub const TOKEN_COMMENT: u8 = 36;
 #[repr(C)]
 #[derive(Debug)]


@@ -79,6 +79,8 @@ fn collect_tokens() -> Vec<Lexeme> {
 fn main() {
     unsafe {
+        use util::defs::*;
         // assert initial state
         assert_eq!((&raw const input_file).read(), 0);
         assert_eq!((&raw const buffer_len).read(), 0);
@@ -134,16 +136,16 @@ fn main() {
         assert_eq!(
             &collect_tokens()[..],
             &[
-                Lexeme(31, "this-is-an-ident"),
-                Lexeme(31, "another_ident123"),
-                Lexeme(31, "_underscore_test"),
-                Lexeme(31, "mixedCASEIdent"),
-                Lexeme(31, "number12345"),
-                Lexeme(31, "____"),
-                Lexeme(31, "_"),
+                Lexeme(TOKEN_IDENT, "this-is-an-ident"),
+                Lexeme(TOKEN_IDENT, "another_ident123"),
+                Lexeme(TOKEN_IDENT, "_underscore_test"),
+                Lexeme(TOKEN_IDENT, "mixedCASEIdent"),
+                Lexeme(TOKEN_IDENT, "number12345"),
+                Lexeme(TOKEN_IDENT, "____"),
+                Lexeme(TOKEN_IDENT, "_"),
                 Lexeme(17, ""),
-                Lexeme(31, "leading-minus"),
-                Lexeme(31, "trailing-minus-"),
+                Lexeme(TOKEN_IDENT, "leading-minus"),
+                Lexeme(TOKEN_IDENT, "trailing-minus-"),
             ]
         );
@@ -155,7 +157,7 @@ fn main() {
             &collect_tokens()[..],
             &[
                 Lexeme(4, ""),
-                Lexeme(31, "my-function"),
+                Lexeme(TOKEN_IDENT, "my-function"),
                 Lexeme(19, ""),
                 Lexeme(18, ""),
                 Lexeme(12, ""),
@@ -185,7 +187,7 @@ fn main() {
             &[
                 Lexeme(34, ""),
                 Lexeme(4, ""),
-                Lexeme(31, "my-function"),
+                Lexeme(TOKEN_IDENT, "my-function"),
                 Lexeme(19, ""),
                 Lexeme(18, ""),
                 Lexeme(12, ""),
@@ -206,11 +208,11 @@ fn main() {
         assert_eq!(
             &collect_tokens()[..],
             &[
-                Lexeme(32, "1234"),
-                Lexeme(32, "123_345_"),
-                Lexeme(32, "1234____56"),
-                Lexeme(32, "1"),
-                Lexeme(32, "0"),
+                Lexeme(TOKEN_NUMBER, "1234"),
+                Lexeme(TOKEN_NUMBER, "123_345_"),
+                Lexeme(TOKEN_NUMBER, "1234____56"),
+                Lexeme(TOKEN_NUMBER, "1"),
+                Lexeme(TOKEN_NUMBER, "0"),
             ]
         );
@@ -221,11 +223,11 @@ fn main() {
         assert_eq!(
            &collect_tokens()[..],
             &[
-                Lexeme(33, "\"this is a string\""),
-                Lexeme(33, "\"another\nstring\nspanning multiple\n lines\""),
-                Lexeme(33, "\"string with a \\\"quoted\\\" word\""),
-                Lexeme(33, "\"a\""),
-                Lexeme(33, "\"\"")
+                Lexeme(TOKEN_STRING, "\"this is a string\""),
+                Lexeme(TOKEN_STRING, "\"another\nstring\nspanning multiple\n lines\""),
+                Lexeme(TOKEN_STRING, "\"string with a \\\"quoted\\\" word\""),
+                Lexeme(TOKEN_STRING, "\"a\""),
+                Lexeme(TOKEN_STRING, "\"\"")
             ],
         );
@@ -236,7 +238,7 @@ fn main() {
         assert_eq!(
             &collect_tokens()[..],
-            &[Lexeme(32, "3"), Lexeme(16, "+"), Lexeme(32, "4")],
+            &[Lexeme(TOKEN_NUMBER, "3"), Lexeme(16, "+"), Lexeme(TOKEN_NUMBER, "4")],
         );
         eprint!("Initializing tokeniser.. ");
@@ -248,16 +250,16 @@ fn main() {
             &collect_tokens()[..],
             &[
                 Lexeme(4, "fn"),
-                Lexeme(31, "main"),
+                Lexeme(TOKEN_IDENT, "main"),
                 Lexeme(19, "("),
                 Lexeme(18, ")"),
                 Lexeme(12, "->"),
                 Lexeme(30, "void"),
                 Lexeme(21, "{"),
                 Lexeme(5, "return"),
-                Lexeme(32, "1"),
+                Lexeme(TOKEN_NUMBER, "1"),
                 Lexeme(16, "+"),
-                Lexeme(32, "2"),
+                Lexeme(TOKEN_NUMBER, "2"),
                 Lexeme(23, ";"),
                 Lexeme(20, "}"),
             ],