from-scratch/lang/tests/asm_to_rust.py

311 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
asm_to_rust.py
Scan one or more assembly source files and extract:
- commented struct definitions inside `start-structs` / `end-structs` spans
- constant definitions inside `start-consts` / `end-consts` spans
- commented function-definition directives of the form `define-fn: fn ...`
- commented markdown rust fenced code blocks (```rust) and copy their inner code
into the generated Rust output (fences are removed and comment markers stripped)
Produce Rust source code containing:
- an `extern "C"` block with `pub unsafe fn ...;` declarations for each define-fn
- `pub const NAME: u32 = <value>;` lines for each `equ` constant found in const spans
- `#[repr(C)] pub struct Name { pub field: Type, ... }` for each struct found in struct spans
- verbatim Rust code copied from commented ```rust``` blocks (fences removed)
Notes:
- Struct and function definitions must appear on commented lines. Any number of leading semicolons
(e.g. `;`, `;;`, `;;;`) and surrounding spaces are allowed and will be stripped.
- Constant lines inside const spans may be commented or not; the script strips leading semicolons
before parsing.
- Commented rust blocks are expected to use commented fenced code blocks, e.g.:
;; ```rust
;; extern "C" { ... }
;; ```
The inner lines will be uncommented (leading semicolons removed) and included in output.
- By default the script writes to stdout. Use `-o` to write combined output to a file, or `-d`
to write one .rs file per input with the same basename.
"""
import argparse
import re
import sys
from pathlib import Path
from typing import List, Tuple, Dict, Any
# Comment prefix: optional whitespace, one or more ';', optional whitespace.
LEADING_COMMENT_RE = re.compile(r'^\s*;+\s*')

# Span delimiters. Each must sit on a commented line; matching ignores case.
START_STRUCTS_RE = re.compile(r'^\s*;+\s*start-structs\b', re.I)
END_STRUCTS_RE = re.compile(r'^\s*;+\s*end-structs\b', re.I)
START_CONSTS_RE = re.compile(r'^\s*;+\s*start-consts\b', re.I)
END_CONSTS_RE = re.compile(r'^\s*;+\s*end-consts\b', re.I)

# `define-fn: <signature>` directive on a commented line; group 1 is the signature text.
DEFINE_FN_RE = re.compile(r'^\s*;+\s*define-fn:\s*(.+)$', re.I)

# `NAME equ VALUE`; group 1 is the name, group 2 is everything after `equ`.
CONST_EQU_RE = re.compile(r'^\s*([A-Za-z_]\w*)\s+equ\s+(.+)$', re.I)

# `struct Name {` header, applied after the comment prefix has been stripped (case-sensitive).
STRUCT_START_RE = re.compile(r'^\s*struct\s+([A-Za-z_]\w*)\s*\{')

# A bare ``` fence or a ```rust fence, applied after the comment prefix has been stripped.
RUST_FENCE_RE = re.compile(r'^\s*```\s*(rust)?\s*$', re.I)
def strip_leading_semicolons(line: str) -> str:
    """Return *line* without its comment prefix and without trailing newlines.

    The prefix is whatever ``LEADING_COMMENT_RE`` matches at the start of the
    line (optional whitespace, one or more ';', optional whitespace). Lines
    that do not start with such a prefix only lose their trailing newline(s).
    """
    prefix = LEADING_COMMENT_RE.match(line)
    remainder = line[prefix.end():] if prefix else line
    return remainder.rstrip('\n')
def _closing_brace_index(text: str, depth: int) -> Tuple[int, int]:
    """Scan *text* while tracking brace nesting that starts at *depth*.

    Returns ``(index, depth)``: ``index`` is the position of the '}' that
    brings the nesting back to zero, or -1 if *text* ends first; ``depth`` is
    the nesting level after the scan.
    """
    for pos, ch in enumerate(text):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return pos, 0
    return -1, depth


def extract_structs_from_commented_lines(lines: List[str]) -> List[Tuple[str, List[str]]]:
    """
    Find every ``struct Name { ... }`` block in *lines* and return ``(name, field_lines)`` pairs.

    *lines* must already have their leading ';' comment markers stripped.
    Braces are matched character by character, so struct bodies may contain
    nested braces, and the closing brace may share a line with other text:

    - text after the opening '{' on the header line is kept as a body entry,
      so a one-line ``struct S { a: u32 }`` yields ``[' a: u32 ']``;
    - text before the closing '}' on the last line is kept, and the brace and
      anything after it are dropped (``b: u32 }`` yields ``'b: u32 '``).

    Lines strictly between the header and the closing line are returned
    unchanged. A struct whose closing brace never appears takes every
    remaining line as its body.
    """
    structs: List[Tuple[str, List[str]]] = []
    i = 0
    n = len(lines)
    while i < n:
        header = STRUCT_START_RE.match(lines[i])
        if not header:
            i += 1
            continue
        name = header.group(1)
        body_lines: List[str] = []
        depth = 1  # the '{' consumed by STRUCT_START_RE
        text = lines[i][header.end():]
        partial = True  # `text` is a fragment of a line rather than a whole line
        i += 1
        while True:
            close_at, depth = _closing_brace_index(text, depth)
            if close_at >= 0:
                text = text[:close_at]
                partial = True
            # Whole lines are preserved as-is; fragments only when they carry content.
            if not partial or text.strip():
                body_lines.append(text)
            if close_at >= 0 or i >= n:
                break
            text = lines[i]
            partial = False
            i += 1
        structs.append((name, body_lines))
    return structs
def format_rust_struct(name: str, field_lines: List[str]) -> str:
    """
    Render a ``#[repr(C)] #[derive(Debug)] pub struct`` from raw field lines.

    Each entry of *field_lines* is handled as follows:

    - blank entries (or entries that are only commas) are skipped;
    - entries starting with ``//`` or ``#[`` (comments, doc comments,
      attributes) are copied through unchanged instead of being turned into
      a bogus ``pub ...,`` field;
    - otherwise the entry is a field: it is split on the first ':' into name
      and type and emitted as ``pub name: type,``. An entry without ':' is
      emitted as ``pub <entry>,``;
    - a field that already starts with ``pub `` or ``pub(`` keeps its own
      visibility rather than getting a second ``pub``;
    - a trailing ``// comment`` on a field is re-attached after the comma.

    Returns the struct source as a single string without a trailing newline.
    """
    out_lines = ['#[repr(C)]', '#[derive(Debug)]', f'pub struct {name} {{']
    for raw in field_lines:
        line = raw.strip()
        if not line:
            continue
        if line.startswith(('//', '#[')):
            out_lines.append(f' {line}')
            continue
        # Separate a trailing line comment so the appended comma is not swallowed by it.
        code, marker, comment = line.partition('//')
        code = code.strip().rstrip(',').rstrip()
        if not code:
            continue
        suffix = f' //{comment}' if marker else ''
        visibility = '' if code.startswith(('pub ', 'pub(')) else 'pub '
        if ':' in code:
            fname, ftype = code.split(':', 1)
            out_lines.append(f' {visibility}{fname.strip()}: {ftype.strip()},{suffix}')
        else:
            out_lines.append(f' {visibility}{code},{suffix}')
    out_lines.append('}')
    return '\n'.join(out_lines)
def _strip_asm_trailing_comment(text: str) -> str:
    """Return *text* cut at the first ';' that is not inside a quoted literal."""
    quote = ''
    for pos, ch in enumerate(text):
        if quote:
            if ch == quote:
                quote = ''
        elif ch in '\'"`':
            quote = ch
        elif ch == ';':
            return text[:pos]
    return text


def _consts_from_span(span_lines: List[str]) -> List[Tuple[str, str]]:
    """Extract ``(name, value)`` pairs from the ``NAME equ VALUE`` lines of a const span."""
    found: List[Tuple[str, str]] = []
    for line in span_lines:
        m = CONST_EQU_RE.match(strip_leading_semicolons(line))
        if not m:
            continue
        # Drop a trailing `; comment` so it does not end up inside the Rust expression.
        value = _strip_asm_trailing_comment(m.group(2)).strip()
        if value:
            found.append((m.group(1), value))
    return found


def _structs_from_span(span_lines: List[str]) -> List[Tuple[str, List[str]]]:
    """Uncomment the non-blank lines of a struct span and extract its struct definitions."""
    uncommented = [strip_leading_semicolons(l) for l in span_lines if l.strip()]
    return extract_structs_from_commented_lines(uncommented)


def _read_rust_block(lines: List[str], start: int) -> Tuple[List[str], int]:
    """
    Collect the body of a fenced block whose first body line is ``lines[start]``.

    Returns ``(block_lines, next_index)``; ``next_index`` is the index just
    past the closing fence, or past the end of *lines* when no closing fence
    exists. Commented lines are uncommented; uncommented lines only lose
    their trailing newline.
    """
    block: List[str] = []
    i = start
    n = len(lines)
    while i < n:
        cur = lines[i]
        text = strip_leading_semicolons(cur)
        # Only a *commented* fence closes the block.
        if LEADING_COMMENT_RE.match(cur) and RUST_FENCE_RE.match(text):
            break
        block.append(text)
        i += 1
    return block, i + 1


def parse_file(path: Path) -> Dict[str, Any]:
    """
    Parse a single assembly file and return dict with keys: 'functions', 'consts', 'structs', 'rust_blocks'
    - functions: list of signature strings (e.g. "parse_ast(data: *const u8) -> Ast")
    - consts: list of (name, value); a trailing `; comment` after the value is removed
    - structs: list of (name, field_lines)
    - rust_blocks: list of rust code blocks; each block is list[str] of code lines (no fences, uncommented)

    The file is read as UTF-8. define-fn directives and fenced rust blocks
    are only recognised outside struct/const spans. A span that is still open
    at end of file is processed as if it had been closed there, and a warning
    is printed to stderr.
    """
    functions: List[str] = []
    consts: List[Tuple[str, str]] = []
    structs: List[Tuple[str, List[str]]] = []
    rust_blocks: List[List[str]] = []
    with path.open('r', encoding='utf-8') as f:
        lines = f.readlines()

    in_structs = False
    in_consts = False
    struct_buffer: List[str] = []
    const_buffer: List[str] = []
    i = 0
    n = len(lines)
    while i < n:
        raw = lines[i]
        if not in_structs and START_STRUCTS_RE.match(raw):
            in_structs = True
            struct_buffer = []
        elif in_structs and END_STRUCTS_RE.match(raw):
            structs.extend(_structs_from_span(struct_buffer))
            in_structs = False
            struct_buffer = []
        elif not in_consts and START_CONSTS_RE.match(raw):
            in_consts = True
            const_buffer = []
        elif in_consts and END_CONSTS_RE.match(raw):
            consts.extend(_consts_from_span(const_buffer))
            in_consts = False
            const_buffer = []
        elif in_structs:
            # Struct definitions must be on commented lines; ignore anything else.
            if LEADING_COMMENT_RE.match(raw):
                struct_buffer.append(raw)
        elif in_consts:
            const_buffer.append(raw)
        else:
            mfn = DEFINE_FN_RE.match(raw)
            if mfn:
                sig = mfn.group(1).strip()
                if sig.startswith('fn '):
                    sig = sig[len('fn '):].strip()
                functions.append(sig)
            elif LEADING_COMMENT_RE.match(raw) and RUST_FENCE_RE.match(strip_leading_semicolons(raw)):
                # The helper returns the index just past the closing fence.
                block_lines, i = _read_rust_block(lines, i + 1)
                rust_blocks.append(block_lines)
                continue
        i += 1

    # Flush spans left open at EOF instead of silently discarding their content.
    if in_structs:
        print(f'warning: {path}: start-structs span is not closed, reading to end of file', file=sys.stderr)
        structs.extend(_structs_from_span(struct_buffer))
    if in_consts:
        print(f'warning: {path}: start-consts span is not closed, reading to end of file', file=sys.stderr)
        consts.extend(_consts_from_span(const_buffer))

    return {
        'functions': functions,
        'consts': consts,
        'structs': structs,
        'rust_blocks': rust_blocks,
    }
def render_rust(function_sigs: List[str], consts: List[Tuple[str, str]],
                structs: List[Tuple[str, List[str]]], rust_blocks: List[List[str]]) -> str:
    """
    Assemble the generated Rust source text.

    Sections appear in this order: a two-line header, one `unsafe extern "C"`
    block holding every function signature (only when there are any), the
    `u32` constants, the structs (via ``format_rust_struct``), then the
    copied rust blocks. Sections are separated by blank lines, trailing blank
    lines are removed, and the result ends with a newline.
    """
    out: List[str] = [
        '#![allow(non_camel_case_types, dead_code, non_upper_case_globals, improper_ctypes)]',
        '// Auto-generated Rust bindings from assembly source\n',
    ]

    # All declarations share one extern block.
    if function_sigs:
        out.append('unsafe extern "C" {')
        out.extend(f' pub unsafe fn {sig};' for sig in function_sigs)
        out.extend(['}', ''])

    out.extend(f'pub const {name}: u32 = {value};' for name, value in consts)
    if consts:
        out.append('')

    # One blank line after every struct.
    for name, field_lines in structs:
        out.extend([format_rust_struct(name, field_lines), ''])

    # Copied blocks are already uncommented and fence-less.
    for block in rust_blocks:
        if out and out[-1] != '':
            out.append('')  # keep the block separated from what precedes it
        out.extend(line.rstrip('\n') for line in block)
        out.append('')

    while out and out[-1] == '':
        out.pop()
    if not out:
        return ''
    return '\n'.join(out) + '\n'
def main(argv=None):
    """
    Command-line entry point; returns the process exit status (always 0).

    *argv* is the argument list handed to argparse (``None`` means
    ``sys.argv[1:]``). Inputs that do not exist or are not regular files are
    skipped with a warning on stderr. With ``-d`` one ``<stem>.rs`` file is
    written per input; otherwise the combined output of all inputs goes to
    the ``-o`` file or to stdout.
    """
    parser = argparse.ArgumentParser(description='Parse assembly files and emit Rust externs, consts, struct defs, and commented ```rust``` blocks.')
    parser.add_argument('inputs', metavar='INPUT', type=Path, nargs='+', help='assembly source files to parse')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-o', '--out', type=Path, help='write combined Rust to this file (default stdout)')
    group.add_argument('-d', '--out-dir', type=Path, help='write one .rs file per input into this directory')
    args = parser.parse_args(argv)

    parsed_inputs: List[Tuple[Path, Dict[str, Any]]] = []
    for inp in args.inputs:
        if not inp.exists():
            print(f'warning: input file {inp} does not exist, skipping', file=sys.stderr)
            continue
        if not inp.is_file():
            # A directory passes exists() but cannot be opened as a text file.
            print(f'warning: input {inp} is not a regular file, skipping', file=sys.stderr)
            continue
        parsed_inputs.append((inp, parse_file(inp)))

    if args.out_dir:
        outdir = args.out_dir
        outdir.mkdir(parents=True, exist_ok=True)
        written: Dict[Path, Path] = {}  # output path -> input it was generated from
        for inp, parsed in parsed_inputs:
            outpath = outdir / (inp.stem + '.rs')
            previous = written.get(outpath)
            if previous == inp:
                continue  # same input listed twice; already written
            if previous is not None:
                print(f'warning: {inp} and {previous} both map to {outpath}; overwriting', file=sys.stderr)
            src = render_rust(parsed['functions'], parsed['consts'], parsed['structs'], parsed['rust_blocks'])
            with outpath.open('w', encoding='utf-8') as f:
                f.write(src)
            written[outpath] = inp
            print(f'Wrote {outpath}', file=sys.stderr)
        return 0

    combined_functions: List[str] = []
    combined_consts: List[Tuple[str, str]] = []
    combined_structs: List[Tuple[str, List[str]]] = []
    combined_rust_blocks: List[List[str]] = []
    for _, parsed in parsed_inputs:
        combined_functions.extend(parsed['functions'])
        combined_consts.extend(parsed['consts'])
        combined_structs.extend(parsed['structs'])
        combined_rust_blocks.extend(parsed['rust_blocks'])

    combined_src = render_rust(combined_functions, combined_consts, combined_structs, combined_rust_blocks)
    if args.out:
        with args.out.open('w', encoding='utf-8') as f:
            f.write(combined_src)
        print(f'Wrote {args.out}', file=sys.stderr)
    else:
        sys.stdout.write(combined_src)
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes the exit status.
if __name__ == '__main__':
    sys.exit(main())