diff --git a/lang/src/tokeniser.inc b/lang/src/tokeniser.inc
index ca061b8..182894b 100644
--- a/lang/src/tokeniser.inc
+++ b/lang/src/tokeniser.inc
@@ -32,6 +32,8 @@ LEXEMES:
     dq LEX_LBRACKET
     dq LEX_RBRACKET
     dq LEX_VOID
+    dq LEX_SLASH
+    dq LEX_STAR
 
 align 8
 TOKENS:
@@ -66,6 +68,8 @@ TOKENS:
     db TOKEN_LBRACKET ;; 28
     db TOKEN_RBRACKET ;; 29
     db TOKEN_VOID ;; 30
+    db TOKEN_SLASH ;; 31
+    db TOKEN_STAR ;; 32
 
 align 8
 LEXEME_LENS:
@@ -100,9 +104,11 @@ LEXEME_LENS:
     dq LEX_LBRACKET_len
     dq LEX_RBRACKET_len
     dq LEX_VOID_len
+    dq LEX_SLASH_len
+    dq LEX_STAR_len
 
 align 8
-NUM_LEXEMES: dq 31
+NUM_LEXEMES: dq 33
 
     LEX_NOT_A_LEXEME db "", 0
     LEX_LET db "let"
@@ -165,6 +171,10 @@ NUM_LEXEMES: dq 31
     LEX_RBRACKET_len equ $ - LEX_RBRACKET
     LEX_VOID db "void"
     LEX_VOID_len equ $ - LEX_VOID
+    LEX_SLASH db "/"
+    LEX_SLASH_len equ $ - LEX_SLASH
+    LEX_STAR db "*"
+    LEX_STAR_len equ $ - LEX_STAR
     LEX_IDENT db ""
     LEX_IDENT_len equ $ - LEX_IDENT
     LEX_NUMBER db ""
@@ -175,39 +185,41 @@ NUM_LEXEMES: dq 31
     LEX_COMMENT_len equ $ - LEX_COMMENT
 
 ;; start-consts
-    TOKEN_EOF equ 0
-    TOKEN_LET equ 1
-    TOKEN_IF equ 2
-    TOKEN_ELSE equ 3
-    TOKEN_FN equ 4
-    TOKEN_RETURN equ 5
-    TOKEN_LOOP equ 6
-    TOKEN_BREAK equ 7
-    TOKEN_CONTINUE equ 8
-    TOKEN_TRUE equ 9
-    TOKEN_FALSE equ 10
-    TOKEN_BOOL equ 11
-    TOKEN_ARROW equ 12
-    TOKEN_I32 equ 13
-    TOKEN_U32 equ 14
-    TOKEN_EQUALS equ 15
-    TOKEN_PLUS equ 16
-    TOKEN_MINUS equ 17
-    TOKEN_RPARENS equ 18
-    TOKEN_LPARENS equ 19
-    TOKEN_RBRACE equ 20
-    TOKEN_LBRACE equ 21
-    TOKEN_COLON equ 22
-    TOKEN_SEMI equ 23
-    TOKEN_COMMA equ 24
-    TOKEN_PIPE equ 25
-    TOKEN_AMP equ 26
-    TOKEN_EQEQ equ 27
-    TOKEN_LBRACKET equ 28
-    TOKEN_RBRACKET equ 29
-    TOKEN_VOID equ 30
-    TOKEN_IDENT equ 31
-    TOKEN_NUMBER equ 32
-    TOKEN_STRING equ 33
-    TOKEN_COMMENT equ 34
+    TOKEN_EOF equ 0 ; :u8
+    TOKEN_LET equ 1 ; :u8
+    TOKEN_IF equ 2 ; :u8
+    TOKEN_ELSE equ 3 ; :u8
+    TOKEN_FN equ 4 ; :u8
+    TOKEN_RETURN equ 5 ; :u8
+    TOKEN_LOOP equ 6 ; :u8
+    TOKEN_BREAK equ 7 ; :u8
+    TOKEN_CONTINUE equ 8 ; :u8
+    TOKEN_TRUE equ 9 ; :u8
+    TOKEN_FALSE equ 10 ; :u8
+    TOKEN_BOOL equ 11 ; :u8
+    TOKEN_ARROW equ 12 ; :u8
+    TOKEN_I32 equ 13 ; :u8
+    TOKEN_U32 equ 14 ; :u8
+    TOKEN_EQUALS equ 15 ; :u8
+    TOKEN_PLUS equ 16 ; :u8
+    TOKEN_MINUS equ 17 ; :u8
+    TOKEN_RPARENS equ 18 ; :u8
+    TOKEN_LPARENS equ 19 ; :u8
+    TOKEN_RBRACE equ 20 ; :u8
+    TOKEN_LBRACE equ 21 ; :u8
+    TOKEN_COLON equ 22 ; :u8
+    TOKEN_SEMI equ 23 ; :u8
+    TOKEN_COMMA equ 24 ; :u8
+    TOKEN_PIPE equ 25 ; :u8
+    TOKEN_AMP equ 26 ; :u8
+    TOKEN_EQEQ equ 27 ; :u8
+    TOKEN_LBRACKET equ 28 ; :u8
+    TOKEN_RBRACKET equ 29 ; :u8
+    TOKEN_VOID equ 30 ; :u8
+    TOKEN_SLASH equ 31 ; :u8
+    TOKEN_STAR equ 32 ; :u8
+    TOKEN_IDENT equ 33 ; :u8
+    TOKEN_NUMBER equ 34 ; :u8
+    TOKEN_STRING equ 35 ; :u8
+    TOKEN_COMMENT equ 36 ; :u8
 ;; end-consts
diff --git a/lang/tests/asm_to_rust.py b/lang/tests/asm_to_rust.py
index 4d16cd9..fca691c 100755
--- a/lang/tests/asm_to_rust.py
+++ b/lang/tests/asm_to_rust.py
@@ -44,6 +44,7 @@ DEFINE_FN_RE = re.compile(r'^\s*;+\s*define-fn:\s*(.+)$', re.IGNORECASE)
 CONST_EQU_RE = re.compile(r'^\s*([A-Za-z_]\w*)\s+equ\s+(.+)$', re.IGNORECASE)
 STRUCT_START_RE = re.compile(r'^\s*struct\s+([A-Za-z_]\w*)\s*\{')  # after comment markers stripped
 RUST_FENCE_RE = re.compile(r'^\s*```\s*(rust)?\s*$', re.IGNORECASE)  # matches ``` or ```rust (after stripping leading comment)
+TYPE_ANNOT_RE = re.compile(r':\s*([A-Za-z0-9_\<\>\*\s\[\]\:&]+)')  # matches :u8, : *const u8, : Vec<u8>, etc.
 
 
 def strip_leading_semicolons(line: str) -> str:
@@ -110,12 +111,12 @@ def parse_file(path: Path) -> Dict[str, Any]:
     """
     Parse a single assembly file and return dict with keys:
       'functions', 'consts', 'structs', 'rust_blocks'
     - functions: list of signature strings (e.g. "parse_ast(data: *const u8) -> Ast")
-    - consts: list of (name, value)
+    - consts: list of (name, value, type)
     - structs: list of (name, field_lines)
     - rust_blocks: list of rust code blocks; each block is list[str] of code lines (no fences, uncommented)
     """
     functions: List[str] = []
-    consts: List[Tuple[str, str]] = []
+    consts: List[Tuple[str, str, str]] = []
     structs: List[Tuple[str, List[str]]] = []
     rust_blocks: List[List[str]] = []
@@ -158,8 +159,20 @@
                 m = CONST_EQU_RE.match(s)
                 if m:
                     name = m.group(1)
-                    value = m.group(2).strip()
-                    consts.append((name, value))
+                    rest = m.group(2).strip()
+                    # Defaults
+                    value = rest
+                    ctype = 'u32'
+                    # If there's an inline comment (assembly comments start with ';'), split it off.
+                    if ';' in rest:
+                        val_part, comment_part = rest.split(';', 1)
+                        value = val_part.strip()
+                        # Strip any leading semicolons left in comment_part (e.g. ";; :u8")
+                        comment = comment_part.lstrip(';').strip()
+                        mtype = TYPE_ANNOT_RE.search(comment)
+                        if mtype:
+                            ctype = mtype.group(1).strip()
+                    consts.append((name, value, ctype))
             in_consts = False
             const_buffer = []
             i += 1
@@ -215,7 +228,7 @@ def parse_file(path: Path) -> Dict[str, Any]:
     }
 
 
-def render_rust(function_sigs: List[str], consts: List[Tuple[str, str]],
+def render_rust(function_sigs: List[str], consts: List[Tuple[str, str, str]],
                 structs: List[Tuple[str, List[str]]], rust_blocks: List[List[str]]) -> str:
     parts: List[str] = []
     parts.append('#![allow(non_camel_case_types, dead_code, non_upper_case_globals, improper_ctypes)]')
@@ -230,8 +243,8 @@ def render_rust(function_sigs: List[str], consts: List[Tuple[str, str, str]],
     parts.append('')  # blank line
 
     # Consts
-    for name, value in consts:
-        parts.append(f'pub const {name}: u32 = {value};')
+    for name, value, ctype in consts:
+        parts.append(f'pub const {name}: {ctype} = {value};')
     if consts:
         parts.append('')
 
@@ -265,7 +278,7 @@ def main(argv=None):
     args = parser.parse_args(argv)
 
     combined_functions: List[str] = []
-    combined_consts: List[Tuple[str, str]] = []
+    combined_consts: List[Tuple[str, str, str]] = []
     combined_structs: List[Tuple[str, List[str]]] = []
     combined_rust_blocks: List[List[str]] = []
 
diff --git a/lang/tests/shared/defs.rs b/lang/tests/shared/defs.rs
index 541e684..6b1cb5b 100644
--- a/lang/tests/shared/defs.rs
+++ b/lang/tests/shared/defs.rs
@@ -25,41 +25,43 @@ pub const TYPE_BOOL: u32 = 2;
 pub const TYPE_I32: u32 = 3;
 pub const TYPE_U32: u32 = 4;
 pub const TYPE_STR: u32 = 5;
-pub const TOKEN_EOF: u32 = 0;
-pub const TOKEN_LET: u32 = 1;
-pub const TOKEN_IF: u32 = 2;
-pub const TOKEN_ELSE: u32 = 3;
-pub const TOKEN_FN: u32 = 4;
-pub const TOKEN_RETURN: u32 = 5;
-pub const TOKEN_LOOP: u32 = 6;
-pub const TOKEN_BREAK: u32 = 7;
-pub const TOKEN_CONTINUE: u32 = 8;
-pub const TOKEN_TRUE: u32 = 9;
-pub const TOKEN_FALSE: u32 = 10;
-pub const TOKEN_BOOL: u32 = 11;
-pub const TOKEN_ARROW: u32 = 12;
-pub const TOKEN_I32: u32 = 13;
-pub const TOKEN_U32: u32 = 14;
-pub const TOKEN_EQUALS: u32 = 15;
-pub const TOKEN_PLUS: u32 = 16;
-pub const TOKEN_MINUS: u32 = 17;
-pub const TOKEN_RPARENS: u32 = 18;
-pub const TOKEN_LPARENS: u32 = 19;
-pub const TOKEN_RBRACE: u32 = 20;
-pub const TOKEN_LBRACE: u32 = 21;
-pub const TOKEN_COLON: u32 = 22;
-pub const TOKEN_SEMI: u32 = 23;
-pub const TOKEN_COMMA: u32 = 24;
-pub const TOKEN_PIPE: u32 = 25;
-pub const TOKEN_AMP: u32 = 26;
-pub const TOKEN_EQEQ: u32 = 27;
-pub const TOKEN_LBRACKET: u32 = 28;
-pub const TOKEN_RBRACKET: u32 = 29;
-pub const TOKEN_VOID: u32 = 30;
-pub const TOKEN_IDENT: u32 = 31;
-pub const TOKEN_NUMBER: u32 = 32;
-pub const TOKEN_STRING: u32 = 33;
-pub const TOKEN_COMMENT: u32 = 34;
+pub const TOKEN_EOF: u8 = 0;
+pub const TOKEN_LET: u8 = 1;
+pub const TOKEN_IF: u8 = 2;
+pub const TOKEN_ELSE: u8 = 3;
+pub const TOKEN_FN: u8 = 4;
+pub const TOKEN_RETURN: u8 = 5;
+pub const TOKEN_LOOP: u8 = 6;
+pub const TOKEN_BREAK: u8 = 7;
+pub const TOKEN_CONTINUE: u8 = 8;
+pub const TOKEN_TRUE: u8 = 9;
+pub const TOKEN_FALSE: u8 = 10;
+pub const TOKEN_BOOL: u8 = 11;
+pub const TOKEN_ARROW: u8 = 12;
+pub const TOKEN_I32: u8 = 13;
+pub const TOKEN_U32: u8 = 14;
+pub const TOKEN_EQUALS: u8 = 15;
+pub const TOKEN_PLUS: u8 = 16;
+pub const TOKEN_MINUS: u8 = 17;
+pub const TOKEN_RPARENS: u8 = 18;
+pub const TOKEN_LPARENS: u8 = 19;
+pub const TOKEN_RBRACE: u8 = 20;
+pub const TOKEN_LBRACE: u8 = 21;
+pub const TOKEN_COLON: u8 = 22;
+pub const TOKEN_SEMI: u8 = 23;
+pub const TOKEN_COMMA: u8 = 24;
+pub const TOKEN_PIPE: u8 = 25;
+pub const TOKEN_AMP: u8 = 26;
+pub const TOKEN_EQEQ: u8 = 27;
+pub const TOKEN_LBRACKET: u8 = 28;
+pub const TOKEN_RBRACKET: u8 = 29;
+pub const TOKEN_VOID: u8 = 30;
+pub const TOKEN_SLASH: u8 = 31;
+pub const TOKEN_STAR: u8 = 32;
+pub const TOKEN_IDENT: u8 = 33;
+pub const TOKEN_NUMBER: u8 = 34;
+pub const TOKEN_STRING: u8 = 35;
+pub const TOKEN_COMMENT: u8 = 36;
 
 #[repr(C)]
 #[derive(Debug)]
diff --git a/lang/tests/tokens.rs b/lang/tests/tokens.rs
index 9236ce2..1eb53a5 100644
--- a/lang/tests/tokens.rs
+++ b/lang/tests/tokens.rs
@@ -79,6 +79,8 @@ fn collect_tokens() -> Vec<Lexeme> {
 
 fn main() {
     unsafe {
+        use util::defs::*;
+
         // assert initial state
         assert_eq!((&raw const input_file).read(), 0);
         assert_eq!((&raw const buffer_len).read(), 0);
@@ -134,16 +136,16 @@
         assert_eq!(
             &collect_tokens()[..],
             &[
-                Lexeme(31, "this-is-an-ident"),
-                Lexeme(31, "another_ident123"),
-                Lexeme(31, "_underscore_test"),
-                Lexeme(31, "mixedCASEIdent"),
-                Lexeme(31, "number12345"),
-                Lexeme(31, "____"),
-                Lexeme(31, "_"),
+                Lexeme(TOKEN_IDENT, "this-is-an-ident"),
+                Lexeme(TOKEN_IDENT, "another_ident123"),
+                Lexeme(TOKEN_IDENT, "_underscore_test"),
+                Lexeme(TOKEN_IDENT, "mixedCASEIdent"),
+                Lexeme(TOKEN_IDENT, "number12345"),
+                Lexeme(TOKEN_IDENT, "____"),
+                Lexeme(TOKEN_IDENT, "_"),
                 Lexeme(17, ""),
-                Lexeme(31, "leading-minus"),
-                Lexeme(31, "trailing-minus-"),
+                Lexeme(TOKEN_IDENT, "leading-minus"),
+                Lexeme(TOKEN_IDENT, "trailing-minus-"),
             ]
         );
 
@@ -155,7 +157,7 @@
             &collect_tokens()[..],
             &[
                 Lexeme(4, ""),
-                Lexeme(31, "my-function"),
+                Lexeme(TOKEN_IDENT, "my-function"),
                 Lexeme(19, ""),
                 Lexeme(18, ""),
                 Lexeme(12, ""),
@@ -185,7 +187,7 @@
             &[
-                Lexeme(34, ""),
+                Lexeme(TOKEN_COMMENT, ""),
                 Lexeme(4, ""),
-                Lexeme(31, "my-function"),
+                Lexeme(TOKEN_IDENT, "my-function"),
                 Lexeme(19, ""),
                 Lexeme(18, ""),
                 Lexeme(12, ""),
@@ -206,11 +208,11 @@
         assert_eq!(
             &collect_tokens()[..],
             &[
-                Lexeme(32, "1234"),
-                Lexeme(32, "123_345_"),
-                Lexeme(32, "1234____56"),
-                Lexeme(32, "1"),
-                Lexeme(32, "0"),
+                Lexeme(TOKEN_NUMBER, "1234"),
+                Lexeme(TOKEN_NUMBER, "123_345_"),
+                Lexeme(TOKEN_NUMBER, "1234____56"),
+                Lexeme(TOKEN_NUMBER, "1"),
+                Lexeme(TOKEN_NUMBER, "0"),
            ]
         );
 
@@ -221,11 +223,11 @@
         assert_eq!(
             &collect_tokens()[..],
             &[
-                Lexeme(33, "\"this is a string\""),
-                Lexeme(33, "\"another\nstring\nspanning multiple\n lines\""),
-                Lexeme(33, "\"string with a \\\"quoted\\\" word\""),
-                Lexeme(33, "\"a\""),
-                Lexeme(33, "\"\"")
+                Lexeme(TOKEN_STRING, "\"this is a string\""),
+                Lexeme(TOKEN_STRING, "\"another\nstring\nspanning multiple\n lines\""),
+                Lexeme(TOKEN_STRING, "\"string with a \\\"quoted\\\" word\""),
+                Lexeme(TOKEN_STRING, "\"a\""),
+                Lexeme(TOKEN_STRING, "\"\"")
             ],
         );
 
@@ -236,7 +238,7 @@
 
         assert_eq!(
             &collect_tokens()[..],
-            &[Lexeme(32, "3"), Lexeme(16, "+"), Lexeme(32, "4")],
+            &[Lexeme(TOKEN_NUMBER, "3"), Lexeme(16, "+"), Lexeme(TOKEN_NUMBER, "4")],
         );
 
         eprint!("Initializing tokeniser.. ");
@@ -248,16 +250,16 @@
             &collect_tokens()[..],
             &[
                 Lexeme(4, "fn"),
-                Lexeme(31, "main"),
+                Lexeme(TOKEN_IDENT, "main"),
                 Lexeme(19, "("),
                 Lexeme(18, ")"),
                 Lexeme(12, "->"),
                 Lexeme(30, "void"),
                 Lexeme(21, "{"),
                 Lexeme(5, "return"),
-                Lexeme(32, "1"),
+                Lexeme(TOKEN_NUMBER, "1"),
                 Lexeme(16, "+"),
-                Lexeme(32, "2"),
+                Lexeme(TOKEN_NUMBER, "2"),
                 Lexeme(23, ";"),
                 Lexeme(20, "}"),
             ],
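
Note on the annotation mechanism, outside the patch proper: tokeniser.inc stays the single source of truth and defs.rs is regenerated from it, so the Rust-side type of each constant now rides along in an assembly comment; constants without an annotation keep the u32 default, which is why the TYPE_* constants are unchanged. A minimal round-trip sketch of the new parsing follows; the regexes are copied from the patch and the input line is only an illustrative example:

import re

CONST_EQU_RE = re.compile(r'^\s*([A-Za-z_]\w*)\s+equ\s+(.+)$', re.IGNORECASE)
TYPE_ANNOT_RE = re.compile(r':\s*([A-Za-z0-9_\<\>\*\s\[\]\:&]+)')

# One annotated `equ` line, in the shape tokeniser.inc now uses.
line = '    TOKEN_SLASH equ 31 ; :u8'
m = CONST_EQU_RE.match(line)
name, rest = m.group(1), m.group(2).strip()
value, ctype = rest, 'u32'  # defaults, mirroring parse_file()
if ';' in rest:
    # Split off the assembly comment and look for a :type annotation in it.
    val_part, comment_part = rest.split(';', 1)
    value = val_part.strip()
    annot = TYPE_ANNOT_RE.search(comment_part.lstrip(';').strip())
    if annot:
        ctype = annot.group(1).strip()

print(f'pub const {name}: {ctype} = {value};')
# prints: pub const TOKEN_SLASH: u8 = 31;

The printed line matches the regenerated defs.rs, where the token constants are now u8 to mirror the `db` entries in the TOKENS table.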