311 lines
12 KiB
Python
Executable file
311 lines
12 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
parse_asm_to_rust.py
|
|
|
|
Scan one or more assembly source files and extract:
|
|
- commented struct definitions inside `start-structs` / `end-structs` spans
|
|
- constant definitions inside `start-consts` / `end-consts` spans
|
|
- commented function-definition directives of the form `define-fn: fn ...`
|
|
- commented markdown rust fenced code blocks (```rust) and copy their inner code
|
|
into the generated Rust output (fences are removed and comment markers stripped)
|
|
|
|
Produce Rust source code containing:
|
|
- an `unsafe extern "C"` block with `pub unsafe fn ...;` declarations for each define-fn
|
|
- `pub const NAME: u32 = <value>;` lines for each `equ` constant found in const spans
|
|
- `#[repr(C)] #[derive(Debug)] pub struct Name { pub field: Type, ... }` for each struct found in struct spans
|
|
- verbatim Rust code copied from commented ```rust``` blocks (fences removed)
|
|
|
|
Notes:
|
|
- Struct and function definitions must appear on commented lines. Any number of leading semicolons
|
|
(e.g. `;`, `;;`, `;;;`) and surrounding spaces are allowed and will be stripped.
|
|
- Constant lines inside const spans may be commented or not; the script strips leading semicolons
|
|
before parsing.
|
|
- Commented rust blocks are expected to use commented fenced code blocks, e.g.:
|
|
;; ```rust
|
|
;; extern "C" { ... }
|
|
;; ```
|
|
The inner lines will be uncommented (leading semicolons removed) and included in output.
|
|
- By default the script writes to stdout. Use `-o` to write combined output to a file, or `-d`
|
|
to write one .rs file per input with the same basename.
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Tuple, Dict, Any
|
|
|
|
# Comment prefix of an assembly line: optional indentation, one or more ';',
# then any whitespace that follows the semicolons.
LEADING_COMMENT_RE = re.compile(r'^\s*;+\s*')  # lines that start with one or more semicolons

# Commented markers that open / close a struct-definition span (case-insensitive).
START_STRUCTS_RE = re.compile(r'^\s*;+\s*start-structs\b', re.IGNORECASE)
END_STRUCTS_RE = re.compile(r'^\s*;+\s*end-structs\b', re.IGNORECASE)

# Commented markers that open / close a constant-definition span (case-insensitive).
START_CONSTS_RE = re.compile(r'^\s*;+\s*start-consts\b', re.IGNORECASE)
END_CONSTS_RE = re.compile(r'^\s*;+\s*end-consts\b', re.IGNORECASE)

# Commented `define-fn:` directive; group 1 is everything after the colon.
DEFINE_FN_RE = re.compile(r'^\s*;+\s*define-fn:\s*(.+)$', re.IGNORECASE)

# `NAME equ VALUE`; group 1 is the identifier, group 2 is the rest of the line.
CONST_EQU_RE = re.compile(r'^\s*([A-Za-z_]\w*)\s+equ\s+(.+)$', re.IGNORECASE)

# `struct Name {`; group 1 is the struct name. Applied after comment markers are stripped.
STRUCT_START_RE = re.compile(r'^\s*struct\s+([A-Za-z_]\w*)\s*\{')  # after comment markers stripped

# A fence line consisting only of ``` or ```rust; group 1 is set only when the
# `rust` tag is present. Applied after the leading comment marker is stripped.
RUST_FENCE_RE = re.compile(r'^\s*```\s*(rust)?\s*$', re.IGNORECASE)  # matches ``` or ```rust (after stripping leading comment)
|
|
|
|
|
|
def strip_leading_semicolons(line: str) -> str:
    """Return *line* without its comment prefix and without trailing newlines.

    The prefix removed is whatever ``LEADING_COMMENT_RE`` matches at the start
    of the line: optional indentation, the run of semicolons, and the
    whitespace that follows them. Lines without such a prefix only lose
    their trailing newline characters.
    """
    uncommented = LEADING_COMMENT_RE.sub('', line)
    return uncommented.rstrip('\n')
|
|
|
|
|
|
def _split_at_struct_close(text: str, depth: int) -> Tuple[str, int]:
    """Scan *text* for braces, starting at nesting level *depth*.

    Returns ``(kept, new_depth)``. When the brace that closes the struct is
    found, ``kept`` is the text before that brace and ``new_depth`` is 0;
    otherwise ``kept`` is the whole text and ``new_depth`` the updated level.
    """
    for pos, ch in enumerate(text):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth <= 0:
                return text[:pos], 0
    return text, depth


def extract_structs_from_commented_lines(lines: List[str]) -> List[Tuple[str, List[str]]]:
    """
    Find every ``struct Name { ... }`` block in *lines*.

    *lines* are expected to have their leading ';' comment markers already
    stripped. The result is a list of ``(name, field_lines)`` tuples in
    source order.

    Braces are balanced character by character, so struct bodies may contain
    nested braces. Field text that shares a line with the opening brace
    (including one-line structs such as ``struct P { x: u32 }``) is kept, and
    text that shares a line with the closing brace is kept without the brace.
    Several fields written on one line are returned as a single entry.
    A struct whose closing brace is missing extends to the end of *lines*.
    """
    structs = []
    i = 0
    n = len(lines)
    while i < n:
        m = STRUCT_START_RE.match(lines[i])
        if not m:
            i += 1
            continue

        name = m.group(1)
        body_lines = []

        # The match ends just past the opening brace; anything after it on
        # the same line already belongs to the struct body.
        inline, depth = _split_at_struct_close(lines[i][m.end():], 1)
        if inline.strip():
            body_lines.append(inline)
        i += 1

        while i < n and depth > 0:
            kept, depth = _split_at_struct_close(lines[i], depth)
            # Interior lines are kept verbatim; on the closing line only a
            # non-blank remainder before the brace is kept.
            if depth > 0 or kept.strip():
                body_lines.append(kept)
            i += 1

        structs.append((name, body_lines))
    return structs
|
|
|
|
|
|
def format_rust_struct(name: str, field_lines: List[str]) -> str:
    """
    Render a ``#[repr(C)] #[derive(Debug)] pub struct`` definition.

    Each entry of *field_lines* is handled as follows:

    - blank entries are skipped;
    - entries starting with ``//`` (comments) or ``#[`` (attributes) are
      copied through unchanged instead of being turned into fields;
    - a trailing ``// comment`` is moved after the field's comma so it can
      never swallow the separator;
    - fields that already start with ``pub`` are not given a second ``pub``;
    - otherwise the entry is split on its first ':' into name and type and
      emitted as ``pub name: type,``; entries without ':' are emitted as
      ``pub <text>,``.

    Returns the struct source as a single string without a trailing newline.
    """
    out_lines = ['#[repr(C)]', '#[derive(Debug)]', f'pub struct {name} {{']
    for raw in field_lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(('//', '#[')):
            # Not a field: keep comments and attributes as written.
            out_lines.append(f' {line}')
            continue

        field, has_comment, comment = line.partition('//')
        field = field.strip().rstrip(',').rstrip()
        if not field:
            continue
        trailer = f' //{comment.rstrip()}' if has_comment else ''
        # Respect an explicit visibility (`pub` or `pub(...)`) on the field.
        visibility = '' if field.startswith(('pub ', 'pub(')) else 'pub '

        if ':' in field:
            fname, ftype = field.split(':', 1)
            out_lines.append(f' {visibility}{fname.strip()}: {ftype.strip()},{trailer}')
        else:
            out_lines.append(f' {visibility}{field},{trailer}')
    out_lines.append('}')
    return '\n'.join(out_lines)
|
|
|
|
|
|
def _collect_rust_block(lines: List[str], start: int) -> Tuple[List[str], int]:
    """Collect the body of a fenced block whose first body line is ``lines[start]``.

    Commented lines are uncommented; uncommented lines are kept as-is (minus
    the trailing newline). Collection stops at the first commented fence line
    or at end of input. Returns ``(block_lines, index_after_block)``.
    """
    block: List[str] = []
    i = start
    n = len(lines)
    while i < n:
        cur = lines[i]
        if LEADING_COMMENT_RE.match(cur):
            text = strip_leading_semicolons(cur)
            if RUST_FENCE_RE.match(text):
                return block, i + 1  # skip the closing fence
            block.append(text)
        else:
            block.append(cur.rstrip('\n'))
        i += 1
    return block, i


def parse_file(path: Path) -> Dict[str, Any]:
    """
    Parse a single assembly file (read as UTF-8) and return a dict with keys:

    - 'functions': list of signature strings taken from `define-fn:` lines,
      with a leading `fn ` removed (e.g. "parse_ast(data: *const u8) -> Ast")
    - 'consts': list of (name, value) from `NAME equ VALUE` lines inside
      start-consts / end-consts spans; a trailing `; comment` is not part of
      the value
    - 'structs': list of (name, field_lines) from commented lines inside
      start-structs / end-structs spans
    - 'rust_blocks': list of code blocks; each block is a list[str] of code
      lines (no fences, uncommented)

    A rust block is opened only by a commented fence that carries the `rust`
    tag; a bare ``` fence (for instance the closing fence of a block in
    another language) never opens one. A block is closed by the next
    commented ``` or ```rust fence, or by end of file.

    Spans that are still open at end of file are discarded.
    """
    functions: List[str] = []
    consts: List[Tuple[str, str]] = []
    structs: List[Tuple[str, List[str]]] = []
    rust_blocks: List[List[str]] = []

    with path.open('r', encoding='utf-8') as f:
        lines = f.readlines()

    i = 0
    n = len(lines)
    in_structs = False
    in_consts = False
    struct_buffer: List[str] = []
    const_buffer: List[str] = []

    while i < n:
        raw = lines[i]

        # State transitions for start/end spans.
        if not in_structs and START_STRUCTS_RE.match(raw):
            in_structs = True
            struct_buffer = []
            i += 1
            continue
        if in_structs and END_STRUCTS_RE.match(raw):
            stripped = [strip_leading_semicolons(l) for l in struct_buffer if l.strip()]
            structs.extend(extract_structs_from_commented_lines(stripped))
            in_structs = False
            struct_buffer = []
            i += 1
            continue

        if not in_consts and START_CONSTS_RE.match(raw):
            in_consts = True
            const_buffer = []
            i += 1
            continue
        if in_consts and END_CONSTS_RE.match(raw):
            for buffered in const_buffer:
                m = CONST_EQU_RE.match(strip_leading_semicolons(buffered))
                if not m:
                    continue
                # Everything from the first ';' on is an assembler comment,
                # not part of the value; copying it would break the Rust const.
                value = m.group(2).split(';', 1)[0].strip()
                if value:
                    consts.append((m.group(1), value))
            in_consts = False
            const_buffer = []
            i += 1
            continue

        # Inside a span: just collect lines for processing at the end marker.
        if in_structs:
            if LEADING_COMMENT_RE.match(raw):
                struct_buffer.append(raw)
        elif in_consts:
            const_buffer.append(raw)
        else:
            # Top level: define-fn directives and commented rust blocks.
            mfn = DEFINE_FN_RE.match(raw)
            if mfn:
                sig = mfn.group(1).strip()
                if sig.startswith('fn '):
                    sig = sig[len('fn '):].strip()
                functions.append(sig)
            elif LEADING_COMMENT_RE.match(raw):
                fence = RUST_FENCE_RE.match(strip_leading_semicolons(raw))
                # group(1) is set only when the fence carries the `rust` tag.
                if fence and fence.group(1):
                    block_lines, i = _collect_rust_block(lines, i + 1)
                    rust_blocks.append(block_lines)
                    continue  # i already points past the block
        i += 1

    return {
        'functions': functions,
        'consts': consts,
        'structs': structs,
        'rust_blocks': rust_blocks,
    }
|
|
|
|
|
|
def render_rust(function_sigs: List[str], consts: List[Tuple[str, str]],
                structs: List[Tuple[str, List[str]]], rust_blocks: List[List[str]]) -> str:
    """Assemble the generated Rust source text.

    Sections are emitted in this order: a two-line header, one
    `unsafe extern "C"` block holding every function signature (only when
    there are any), one `pub const ...: u32` line per constant, one struct
    definition per entry of *structs* (via ``format_rust_struct``), and
    finally the collected rust blocks, each separated by a blank line.
    Trailing blank lines are dropped and the text ends with one newline.
    """
    out: List[str] = [
        '#![allow(non_camel_case_types, dead_code, non_upper_case_globals, improper_ctypes)]',
        '// Auto-generated Rust bindings from assembly source\n',
    ]

    # All declarations share a single extern block.
    if function_sigs:
        out.append('unsafe extern "C" {')
        out.extend(f' pub unsafe fn {sig};' for sig in function_sigs)
        out += ['}', '']

    if consts:
        out.extend(f'pub const {name}: u32 = {value};' for name, value in consts)
        out.append('')

    # Each struct is followed by a blank separator line.
    for name, field_lines in structs:
        out += [format_rust_struct(name, field_lines), '']

    for block in rust_blocks:
        # Keep exactly one blank line between the previous section and the block.
        if out and out[-1] != '':
            out.append('')
        out.extend(code_line.rstrip('\n') for code_line in block)
        out.append('')

    while out and out[-1] == '':
        out.pop()

    if not out:
        return ''
    return '\n'.join(out) + '\n'
|
|
|
|
|
|
def main(argv=None):
    """Command-line entry point; returns the process exit status (always 0).

    Every existing input file is parsed and rendered on its own. Inputs that
    do not exist are reported on stderr and skipped. With ``-d`` one ``.rs``
    file per parsed input (named after the input's stem) is written into the
    directory, which is created if needed. Otherwise the results of all
    inputs are merged and rendered once, to the ``-o`` file or to stdout.
    """
    cli = argparse.ArgumentParser(description='Parse assembly files and emit Rust externs, consts, struct defs, and commented ```rust``` blocks.')
    cli.add_argument('inputs', metavar='INPUT', type=Path, nargs='+', help='assembly source files to parse')
    destination = cli.add_mutually_exclusive_group()
    destination.add_argument('-o', '--out', type=Path, help='write combined Rust to this file (default stdout)')
    destination.add_argument('-d', '--out-dir', type=Path, help='write one .rs file per input into this directory')
    opts = cli.parse_args(argv)

    # Accumulators for the combined (-o / stdout) output.
    all_functions: List[str] = []
    all_consts: List[Tuple[str, str]] = []
    all_structs: List[Tuple[str, List[str]]] = []
    all_rust_blocks: List[List[str]] = []

    # Rendered source per input, used by the -d mode.
    rendered: Dict[Path, str] = {}

    for source_path in opts.inputs:
        if not source_path.exists():
            print(f'warning: input file {source_path} does not exist, skipping', file=sys.stderr)
            continue

        result = parse_file(source_path)
        rendered[source_path] = render_rust(
            result['functions'], result['consts'], result['structs'], result['rust_blocks']
        )

        all_functions += result['functions']
        all_consts += result['consts']
        all_structs += result['structs']
        all_rust_blocks += result['rust_blocks']

    if opts.out_dir:
        target_dir = opts.out_dir
        target_dir.mkdir(parents=True, exist_ok=True)
        for source_path, text in rendered.items():
            target = target_dir / (source_path.stem + '.rs')
            with target.open('w', encoding='utf-8') as handle:
                handle.write(text)
            print(f'Wrote {target}', file=sys.stderr)
        return 0

    merged = render_rust(all_functions, all_consts, all_structs, all_rust_blocks)

    if not opts.out:
        sys.stdout.write(merged)
        return 0

    with opts.out.open('w', encoding='utf-8') as handle:
        handle.write(merged)
    print(f'Wrote {opts.out}', file=sys.stderr)
    return 0
|
|
|
|
|
|
# Run the CLI only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())
|