#!/usr/bin/env python3
"""
parse_asm_to_rust.py

Scan one or more assembly source files and extract:
  - commented struct definitions inside `start-structs` / `end-structs` spans
  - constant definitions inside `start-consts` / `end-consts` spans
  - commented function-definition directives of the form `define-fn: fn ...`
  - commented markdown rust fenced code blocks (```rust) and copy their inner
    code into the generated Rust output (fences are removed and comment
    markers stripped)

Produce Rust source code containing:
  - an `extern "C"` block with `pub unsafe fn ...;` declarations for each
    define-fn
  - `pub const NAME: u32 = <value>;` lines for each `equ` constant found in
    const spans
  - `#[repr(C)] pub struct Name { pub field: Type, ... }` for each struct
    found in struct spans
  - Rust code copied from commented ```rust``` blocks (fences removed)

Notes:
  - Struct and function definitions must appear on commented lines. Any
    number of leading semicolons (e.g. `;`, `;;`, `;;;`) and surrounding
    spaces are allowed and will be stripped.
  - Constant lines inside const spans may be commented or not; the script
    strips leading semicolons before parsing. A trailing `; comment` after
    the value is dropped.
  - Commented rust blocks are expected to use commented fenced code blocks,
    e.g.:
        ;; ```rust
        ;; extern "C" { ... }
        ;; ```
    The inner lines will be uncommented (leading semicolons removed) and
    included in output with their relative indentation preserved.
  - Commented fenced blocks in another language (e.g. ```text) are skipped.
  - A span that is still open at the end of the file is processed as if it
    had been closed there.
  - By default the script writes to stdout. Use `-o` to write combined
    output to a file, or `-d` to write one .rs file per input with the same
    basename.
"""

import argparse
import re
import sys
import textwrap
from pathlib import Path
from typing import Any, Dict, Iterable, Iterator, List, Tuple

# Lines that start with one or more semicolons (markers and padding).
LEADING_COMMENT_RE = re.compile(r'^\s*;+\s*')
START_STRUCTS_RE = re.compile(r'^\s*;+\s*start-structs\b', re.IGNORECASE)
END_STRUCTS_RE = re.compile(r'^\s*;+\s*end-structs\b', re.IGNORECASE)
START_CONSTS_RE = re.compile(r'^\s*;+\s*start-consts\b', re.IGNORECASE)
END_CONSTS_RE = re.compile(r'^\s*;+\s*end-consts\b', re.IGNORECASE)
DEFINE_FN_RE = re.compile(r'^\s*;+\s*define-fn:\s*(.+)$', re.IGNORECASE)
CONST_EQU_RE = re.compile(r'^\s*([A-Za-z_]\w*)\s+equ\s+(.+)$', re.IGNORECASE)
# Applied after comment markers have been stripped.
STRUCT_START_RE = re.compile(r'^\s*struct\s+([A-Za-z_]\w*)\s*\{')
# Matches ``` or ```rust (after stripping the leading comment marker).
RUST_FENCE_RE = re.compile(r'^\s*```\s*(rust)?\s*$', re.IGNORECASE)
# A fence carrying an info string; only consulted after RUST_FENCE_RE failed,
# so it identifies fenced blocks written in some other language.
OTHER_FENCE_RE = re.compile(r'^\s*```\s*\S')
# A bare fence, which closes a fenced block.
CLOSING_FENCE_RE = re.compile(r'^\s*```\s*$')
# Comment markers only: unlike LEADING_COMMENT_RE the whitespace that follows
# the semicolons is kept, so the indentation of copied code survives.
COMMENT_MARKER_RE = re.compile(r'^\s*;+')

_OPENING_BRACKETS = '<([{'
_CLOSING_BRACKETS = '>)]}'
_FIELD_INDENT = '    '


def strip_leading_semicolons(line: str) -> str:
    """Remove leading semicolons and surrounding spaces from a commented line.

    A trailing newline is removed as well. Lines without a leading semicolon
    are returned unchanged apart from that newline.
    """
    return LEADING_COMMENT_RE.sub('', line).rstrip('\n')


def _cut_at_closing_brace(text: str, depth: int) -> Tuple[str, int]:
    """Scan *text* for the brace that closes a block opened *depth* levels up.

    Returns ``(kept, new_depth)``. When the closing brace is found, *kept* is
    the text in front of it and *new_depth* is 0; otherwise *kept* is the
    whole text and *new_depth* is the nesting level at the end of the line.
    """
    for pos, ch in enumerate(text):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return text[:pos], 0
    return text, depth


def extract_structs_from_commented_lines(lines: List[str]) -> List[Tuple[str, List[str]]]:
    """
    Given a list of lines (with comments already stripped of leading ';'),
    find all 'struct Name { ... }' blocks. Return list of (name, field_lines).

    Braces are tracked character by character, so struct bodies can contain
    nested braces in types, fields may follow the opening brace on the header
    line (``struct P { x: i32, y: i32 }``) and the closing brace may share a
    line with the last field. The closing brace itself is never part of the
    returned field lines. A struct whose closing brace is missing extends to
    the end of *lines*.
    """
    structs: List[Tuple[str, List[str]]] = []
    i = 0
    n = len(lines)
    while i < n:
        m = STRUCT_START_RE.match(lines[i])
        if not m:
            i += 1
            continue
        name = m.group(1)
        body: List[str] = []
        # The regex ends right after the opening brace; whatever follows on
        # the header line may already hold fields.
        inline, depth = _cut_at_closing_brace(lines[i][m.end():], 1)
        if inline.strip():
            body.append(inline.strip())
        i += 1
        while depth > 0 and i < n:
            text, depth = _cut_at_closing_brace(lines[i], depth)
            if depth > 0:
                body.append(text)
            elif text.strip():
                # Closing line that still carries a field before the brace.
                body.append(text.rstrip())
            i += 1
        structs.append((name, body))
    return structs


def _split_line_comment(line: str) -> Tuple[str, str]:
    """Split *line* into (code, trailing ``//`` comment); comment may be ''."""
    pos = line.find('//')
    if pos < 0:
        return line, ''
    return line[:pos], line[pos:].rstrip()


def _split_fields(text: str) -> List[str]:
    """Split *text* on commas that are not nested inside brackets.

    Commas inside ``<>``, ``()``, ``[]`` or ``{}`` belong to a type (generic
    arguments, fn pointers, tuples, arrays) and do not separate fields.
    Empty pieces are dropped.
    """
    pieces: List[str] = []
    depth = 0
    start = 0
    for pos, ch in enumerate(text):
        if ch in _OPENING_BRACKETS:
            depth += 1
        elif ch in _CLOSING_BRACKETS:
            # The '>' of a '->' return arrow is not a closing bracket.
            if ch == '>' and pos > 0 and text[pos - 1] == '-':
                continue
            depth = max(depth - 1, 0)
        elif ch == ',' and depth == 0:
            pieces.append(text[start:pos])
            start = pos + 1
    pieces.append(text[start:])
    return [piece.strip() for piece in pieces if piece.strip()]


def _render_field(field: str) -> str:
    """Render one ``name: Type`` field as an indented public struct field."""
    if field.startswith(('pub ', 'pub(')):
        # Already carries a visibility qualifier; do not emit `pub pub`.
        return f'{_FIELD_INDENT}{field},'
    if ':' in field:
        fname, ftype = field.split(':', 1)
        return f'{_FIELD_INDENT}pub {fname.strip()}: {ftype.strip()},'
    return f'{_FIELD_INDENT}pub {field},'


def format_rust_struct(name: str, field_lines: List[str]) -> str:
    """
    Convert a list of field lines like '    nodes: Vec<Node>,' into a Rust
    struct with pub fields, #[repr(C)] and #[derive(Debug)].

    Minimal parsing: each field is split on its first ':' to find name and
    type, otherwise the text is preserved. Several fields on one line are
    split at top-level commas. Pure ``//`` comment lines and ``#[...]``
    attribute lines are copied through unchanged, a trailing ``//`` comment
    stays attached to the last field of its line, and fields that already
    start with ``pub`` are not prefixed again.
    """
    out_lines = ['#[repr(C)]', '#[derive(Debug)]', f'pub struct {name} {{']
    for raw in field_lines:
        line = raw.strip()
        if not line:
            continue
        code, comment = _split_line_comment(line)
        code = code.strip()
        if not code:
            out_lines.append(f'{_FIELD_INDENT}{comment}')
            continue
        if code.startswith('#['):
            out_lines.append(f'{_FIELD_INDENT}{line}')
            continue
        rendered = [_render_field(field) for field in _split_fields(code)]
        if not rendered:
            # Nothing but separators in front of the comment.
            if comment:
                out_lines.append(f'{_FIELD_INDENT}{comment}')
            continue
        if comment:
            rendered[-1] = f'{rendered[-1]} {comment}'
        out_lines.extend(rendered)
    out_lines.append('}')
    return '\n'.join(out_lines)


def _strip_trailing_comment(value: str) -> str:
    """Drop a trailing ``; comment`` from an assembly expression.

    Semicolons inside single- or double-quoted literals are kept.
    """
    quote = ''
    for pos, ch in enumerate(value):
        if quote:
            if ch == quote:
                quote = ''
        elif ch in '\'"':
            quote = ch
        elif ch == ';':
            return value[:pos].strip()
    return value.strip()


def _normalize_signature(text: str) -> str:
    """Reduce the payload of a define-fn directive to ``name(args) -> ret``.

    The optional leading ``fn`` keyword and a trailing ``;`` are removed.
    Returns '' when nothing usable is left.
    """
    sig = text.strip().rstrip(';').rstrip()
    if sig == 'fn' or sig.startswith(('fn ', 'fn\t')):
        sig = sig[2:].strip()
    return sig


def _structs_from_buffer(buffer: List[str]) -> List[Tuple[str, List[str]]]:
    """Uncomment the lines of a struct span and extract its structs."""
    stripped = [strip_leading_semicolons(line) for line in buffer if line.strip()]
    return extract_structs_from_commented_lines(stripped)


def _consts_from_buffer(buffer: List[str]) -> List[Tuple[str, str]]:
    """Extract ``(name, value)`` pairs from the `equ` lines of a const span."""
    found: List[Tuple[str, str]] = []
    for line in buffer:
        m = CONST_EQU_RE.match(strip_leading_semicolons(line))
        if not m:
            continue
        value = _strip_trailing_comment(m.group(2))
        # `NAME equ ; comment` has no value; emitting it would yield
        # `pub const NAME: u32 = ;`.
        if value:
            found.append((m.group(1), value))
    return found


def _collect_rust_block(line_iter: Iterator[str]) -> List[str]:
    """Consume lines up to the closing fence and return the enclosed code.

    Commented lines lose their semicolons; uncommented lines are taken as
    they are. The common leading whitespace is removed afterwards so the
    relative indentation of the code is preserved. A missing closing fence
    ends the block at the end of the input.
    """
    collected: List[str] = []
    for cur in line_iter:
        if LEADING_COMMENT_RE.match(cur):
            if RUST_FENCE_RE.match(strip_leading_semicolons(cur)):
                break
            collected.append(COMMENT_MARKER_RE.sub('', cur).rstrip())
        else:
            collected.append(cur.rstrip())
    if not collected:
        return []
    return textwrap.dedent('\n'.join(collected)).split('\n')


def _skip_fenced_block(line_iter: Iterator[str]) -> None:
    """Consume lines up to and including the commented closing fence."""
    for cur in line_iter:
        if LEADING_COMMENT_RE.match(cur) and CLOSING_FENCE_RE.match(strip_leading_semicolons(cur)):
            return


def parse_lines(lines: Iterable[str]) -> Dict[str, Any]:
    """
    Parse the lines of one assembly source and return dict with keys:
      'functions', 'consts', 'structs', 'rust_blocks'
    - functions: list of signature strings
      (e.g. "parse_ast(data: *const u8) -> Ast")
    - consts: list of (name, value)
    - structs: list of (name, field_lines)
    - rust_blocks: list of rust code blocks; each block is list[str] of code
      lines (no fences, uncommented)

    define-fn directives and fenced blocks are only recognised outside of
    struct and const spans.
    """
    functions: List[str] = []
    consts: List[Tuple[str, str]] = []
    structs: List[Tuple[str, List[str]]] = []
    rust_blocks: List[List[str]] = []

    in_structs = False
    in_consts = False
    struct_buffer: List[str] = []
    const_buffer: List[str] = []

    # One shared iterator: the fence helpers advance it past a whole block.
    line_iter = iter(lines)
    for raw in line_iter:
        # State transitions for start/end spans.
        if not in_structs and START_STRUCTS_RE.match(raw):
            in_structs = True
            struct_buffer = []
            continue
        if in_structs and END_STRUCTS_RE.match(raw):
            structs.extend(_structs_from_buffer(struct_buffer))
            in_structs = False
            struct_buffer = []
            continue
        if not in_consts and START_CONSTS_RE.match(raw):
            in_consts = True
            const_buffer = []
            continue
        if in_consts and END_CONSTS_RE.match(raw):
            consts.extend(_consts_from_buffer(const_buffer))
            in_consts = False
            const_buffer = []
            continue

        # Inside a span, lines are only collected.
        if in_structs:
            if LEADING_COMMENT_RE.match(raw):
                struct_buffer.append(raw)
            continue
        if in_consts:
            const_buffer.append(raw)
            continue

        # Top level: define-fn directives (must be commented lines).
        mfn = DEFINE_FN_RE.match(raw)
        if mfn:
            sig = _normalize_signature(mfn.group(1))
            if sig:
                functions.append(sig)
            continue

        # Top level: commented fenced blocks.
        if not LEADING_COMMENT_RE.match(raw):
            continue
        stripped = strip_leading_semicolons(raw)
        if RUST_FENCE_RE.match(stripped):
            rust_blocks.append(_collect_rust_block(line_iter))
        elif OTHER_FENCE_RE.match(stripped):
            # Without this the closing fence of e.g. a ```text block would be
            # taken for the opening fence of a rust block.
            _skip_fenced_block(line_iter)

    # A span left open at EOF still contributes what it collected.
    if in_structs:
        structs.extend(_structs_from_buffer(struct_buffer))
    if in_consts:
        consts.extend(_consts_from_buffer(const_buffer))

    return {
        'functions': functions,
        'consts': consts,
        'structs': structs,
        'rust_blocks': rust_blocks,
    }


def parse_file(path: Path) -> Dict[str, Any]:
    """
    Parse a single assembly file and return dict with keys:
      'functions', 'consts', 'structs', 'rust_blocks'
    (see parse_lines for the shape of each value).

    The file is read as UTF-8; bytes that do not decode are replaced rather
    than aborting the run.
    """
    with path.open('r', encoding='utf-8', errors='replace') as f:
        return parse_lines(f)


def render_rust(function_sigs: List[str],
                consts: List[Tuple[str, str]],
                structs: List[Tuple[str, List[str]]],
                rust_blocks: List[List[str]]) -> str:
    """Render the collected items as one Rust source text.

    The output starts with an `#![allow(...)]` attribute and a header
    comment, followed by the extern block (if any functions), the constants,
    the structs and finally the copied rust blocks. Sections are separated
    by one blank line and the text ends with exactly one newline.
    """
    parts: List[str] = [
        '#![allow(non_camel_case_types, dead_code, non_upper_case_globals, improper_ctypes)]',
        '// Auto-generated Rust bindings from assembly source',
        '',
    ]

    # Functions: wrap in a single extern "C" block if any.
    if function_sigs:
        parts.append('unsafe extern "C" {')
        parts.extend(f'    pub unsafe fn {sig};' for sig in function_sigs)
        parts.append('}')
        parts.append('')

    # Consts
    if consts:
        parts.extend(f'pub const {name}: u32 = {value};' for name, value in consts)
        parts.append('')

    # Structs, each followed by a blank line.
    for name, field_lines in structs:
        parts.append(format_rust_struct(name, field_lines))
        parts.append('')

    # Rust blocks (these are already uncommented and fence-less).
    for block in rust_blocks:
        if parts[-1] != '':
            parts.append('')
        parts.extend(line.rstrip('\n') for line in block)
        parts.append('')

    # Trim trailing blank lines so the output ends with a single newline.
    while parts and parts[-1] == '':
        parts.pop()

    return '\n'.join(parts) + '\n' if parts else ''


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Inputs that do not exist or are directories are skipped with a warning
    on stderr.
    """
    parser = argparse.ArgumentParser(
        description='Parse assembly files and emit Rust externs, consts, struct defs, '
                    'and commented ```rust``` blocks.')
    parser.add_argument('inputs', metavar='INPUT', type=Path, nargs='+',
                        help='assembly source files to parse')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-o', '--out', type=Path,
                       help='write combined Rust to this file (default stdout)')
    group.add_argument('-d', '--out-dir', type=Path,
                       help='write one .rs file per input into this directory')
    args = parser.parse_args(argv)

    combined_functions: List[str] = []
    combined_consts: List[Tuple[str, str]] = []
    combined_structs: List[Tuple[str, List[str]]] = []
    combined_rust_blocks: List[List[str]] = []
    per_file_output: Dict[Path, str] = {}

    for inp in args.inputs:
        if not inp.exists():
            print(f'warning: input file {inp} does not exist, skipping', file=sys.stderr)
            continue
        if inp.is_dir():
            # open() on a directory would raise instead of skipping.
            print(f'warning: input {inp} is a directory, skipping', file=sys.stderr)
            continue
        parsed = parse_file(inp)
        per_file_output[inp] = render_rust(parsed['functions'], parsed['consts'],
                                           parsed['structs'], parsed['rust_blocks'])
        combined_functions.extend(parsed['functions'])
        combined_consts.extend(parsed['consts'])
        combined_structs.extend(parsed['structs'])
        combined_rust_blocks.extend(parsed['rust_blocks'])

    if args.out_dir:
        outdir = args.out_dir
        outdir.mkdir(parents=True, exist_ok=True)
        written = set()
        for inp, src in per_file_output.items():
            outpath = outdir / (inp.stem + '.rs')
            if outpath in written:
                print(f'warning: {inp} overwrites {outpath} written for an earlier input',
                      file=sys.stderr)
            written.add(outpath)
            outpath.write_text(src, encoding='utf-8')
            print(f'Wrote {outpath}', file=sys.stderr)
        return 0

    combined_src = render_rust(combined_functions, combined_consts,
                               combined_structs, combined_rust_blocks)
    if args.out:
        args.out.write_text(combined_src, encoding='utf-8')
        print(f'Wrote {args.out}', file=sys.stderr)
    else:
        sys.stdout.write(combined_src)
    return 0


if __name__ == '__main__':
    raise SystemExit(main())