Pseudocode decompilation improvements and docs

This commit is contained in:
MaddoScientisto 2026-03-26 22:10:48 +01:00
commit 589bfc31ef
1898 changed files with 60634 additions and 6597 deletions

View file

@ -12,7 +12,13 @@ if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from tools.poc_crusader_usecode_parser import EXTRACTED_ROOT, parse_body_ir, render_pseudocode
from tools.poc_crusader_usecode_parser import (
EXTRACTED_ROOT,
default_shape_catalog_path,
load_shape_catalog,
parse_body_ir,
render_pseudocode,
)
def load_rows(class_event_index: Path) -> list[dict[str, str]]:
@ -42,6 +48,14 @@ def safe_name(value: str) -> str:
return cleaned.strip("._") or "unknown"
def describe_row(row: dict[str, str]) -> str:
    """Return a one-line, human-readable label for a class/event index row.

    The label has the form ``entry <index> <class>::<event> (slot 0xNN)``.
    A missing or empty class name hint is shown as ``unknown``, a missing or
    empty event name hint as ``slot_NN``, and a missing entry index as ``?``.
    A missing ``slot`` column is treated as slot 0.
    """
    class_name = row.get("class_name_hint") or "unknown"
    slot = parse_int(row.get("slot", "0"))
    event_name = row.get("event_name_hint") or f"slot_{slot:02X}"
    entry_index = row.get("entry_index", "?")
    # Interpolate the names directly so that no text inside a class or event
    # name is ever rewritten by a post-processing replace.
    return f"entry {entry_index} {class_name}::{event_name} (slot 0x{slot:02X})"
def output_path_for_row(output_root: Path, row: dict[str, str]) -> Path:
class_name = row["class_name_hint"]
slot = parse_int(row["slot"])
@ -119,35 +133,57 @@ def main() -> None:
default="auto",
help="Crusader intrinsic numbering to apply during export (default: auto, fallback regret)",
)
parser.add_argument(
"--shape-csv",
help=(
"Shape catalog CSV to apply to pseudocode output "
"(default: Remorse uses <extracted-root>/usecode_shape_catalog_remorse.csv; "
"Regret uses <extracted-root>/usecode_shape_catalog_regret.csv)"
),
)
args = parser.parse_args()
extracted_root = Path(args.extracted_root)
class_event_index = extracted_root / "class_event_index.tsv"
class_layout_index = extracted_root / "class_layout_index.tsv"
output_root = Path(args.output_dir) if args.output_dir else extracted_root / "pseudocode"
shape_csv = Path(args.shape_csv) if args.shape_csv else default_shape_catalog_path(extracted_root, args.variant)
shape_catalog = load_shape_catalog(shape_csv)
output_root.mkdir(parents=True, exist_ok=True)
rows = load_rows(class_event_index)
work_rows = [row for row in rows if row.get("derived_body_start") and row.get("derived_body_end")]
layout_by_entry = load_layout_by_entry(class_layout_index)
index_rows: list[dict[str, str]] = []
exported = 0
for row in rows:
if not row.get("derived_body_start") or not row.get("derived_body_end"):
continue
print(
f"Exporting pseudocode from {extracted_root} to {output_root} using variant={args.variant} and shape_csv={shape_csv}",
flush=True,
)
for position, row in enumerate(work_rows, start=1):
entry_index = parse_int(row["entry_index"])
layout_row = layout_by_entry.get(entry_index)
if layout_row is None:
print(
f"[{position}/{len(work_rows)}] Skipping {describe_row(row)} because no layout row was found",
flush=True,
)
continue
label = describe_row(row)
print(f"[{position}/{len(work_rows)}] Decoding {label}", flush=True)
ir = parse_body_ir(row, layout_row, None if args.variant == "auto" else args.variant, extracted_root)
pseudocode = render_pseudocode(ir)
print(f"[{position}/{len(work_rows)}] Rendering {label}", flush=True)
pseudocode = render_pseudocode(ir, shape_catalog=shape_catalog)
path = output_path_for_row(output_root, row)
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(pseudocode, encoding="utf-8")
index_rows.append(build_index_row(output_root, row, path, ir))
exported += 1
print(f"[{position}/{len(work_rows)}] Wrote {path.relative_to(output_root.parent).as_posix()}", flush=True)
write_index(output_root, index_rows)
write_readme(output_root, exported)

View file

@ -6,6 +6,7 @@ import csv
import hashlib
import json
import re
from functools import lru_cache
from dataclasses import dataclass
from pathlib import Path
from typing import Any
@ -457,6 +458,158 @@ VARIANT_INTRINSIC_CALLSITE_HINTS: dict[str, dict[tuple[int, int], str]] = {
}
# Display names for specific (class id, event slot) pairs. These take priority
# over the event name hint carried by an op when a call target is rendered.
CLASS_EVENT_NAME_HINTS: dict[tuple[int, int], str] = {
    (0x0A0C, 0x32): "waitNTimerTicks",
}
# Maps the third loopscr selector token to the keyword used in the rendered
# ``nearby_items(<keyword>=...)`` loop selector text.
LOOP_SELECTOR_FIELD_HINTS = {
    0x3A: "family",
    0x40: "shape",
}
# Catalog file name used when no game variant could be determined.
SHAPE_CATALOG_FILENAME = "usecode_shape_catalog.csv"
# Per-variant catalog file names, looked up under the extracted root.
SHAPE_CATALOG_FILENAMES = {
    "remorse": "usecode_shape_catalog_remorse.csv",
    "regret": "usecode_shape_catalog_regret.csv",
}
# Column names of a shape catalog CSV.
SHAPE_CATALOG_FIELDNAMES = ["shape_code", "human_readable_id", "description"]
# A hexadecimal (0x...) or decimal integer literal.
NUMERIC_SHAPE_LITERAL_PATTERN = r"(?:0x[0-9A-Fa-f]+|\d+)"
# Places in rendered pseudocode where a numeric literal is treated as a shape
# code. Every pattern exposes the text before the literal as ``prefix`` and the
# literal itself as ``value``.
SHAPE_REFERENCE_PATTERNS = (
    # shape=<literal>
    re.compile(rf"(?P<prefix>\bshape=)(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"),
    # Item.getShape(...) / Item.getType(...) compared against <literal>
    re.compile(
        rf"(?P<prefix>\bItem\.(?:getShape|getType)\([^\)\n]*\)\s*(?:==|!=|<=|>=|<|>)\s*)(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"
    ),
    # Second argument of Item.create(...)
    re.compile(rf"(?P<prefix>\bItem\.create\(\s*[^,\n]+,\s*)(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"),
    # First argument of Item.legal_create(...)
    re.compile(rf"(?P<prefix>\bItem\.legal_create\(\s*)(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"),
)
# Shape code -> catalog row (string keys "shape_code", "human_readable_id",
# "description").
ShapeCatalog = dict[int, dict[str, str]]
def infer_shape_catalog_variant(extracted_root: Path | str | None = None, game_variant: str | None = None) -> str | None:
    """Decide which game variant's shape catalog applies.

    The sources are tried in order and the first one that yields a value wins:

    1. ``game_variant`` as normalized by ``normalize_game_variant``.
    2. ``infer_game_variant_from_path`` applied to the resolved extracted root.
    3. The position of the extracted root inside the repository.

    Returns ``None`` when none of them produces a variant.
    """
    explicit = normalize_game_variant(game_variant)
    if explicit is not None:
        return explicit

    root = resolve_extracted_root(extracted_root)
    from_path = infer_game_variant_from_path(root)
    if from_path is not None:
        return from_path

    try:
        parts = root.resolve().relative_to(REPO_ROOT.resolve()).parts
    except ValueError:
        # The root is not inside the repository, so no layout rule can match.
        return None

    lowered = tuple(part.lower() for part in parts)
    known_layouts = (
        (("usecode", "eusecode_extracted"), "remorse"),
        (("usecode", "regret", "regret_usecode_extracted"), "regret"),
    )
    for prefix, variant in known_layouts:
        if lowered[: len(prefix)] == prefix:
            return variant
    return None
def default_shape_catalog_path(
    extracted_root: Path | str | None = None,
    game_variant: str | None = None,
) -> Path:
    """Return the default shape catalog path under the extracted root.

    The file name is the variant-specific one from ``SHAPE_CATALOG_FILENAMES``
    when a variant can be inferred, and ``SHAPE_CATALOG_FILENAME`` otherwise.
    """
    base_dir = resolve_extracted_root(extracted_root)
    detected = infer_shape_catalog_variant(base_dir, game_variant)
    if detected in SHAPE_CATALOG_FILENAMES:
        return base_dir / SHAPE_CATALOG_FILENAMES[detected]
    return base_dir / SHAPE_CATALOG_FILENAME
def format_shape_code(shape_code: int) -> str:
    """Format a shape code as ``0x`` plus at least four uppercase hex digits."""
    return "0x" + format(shape_code, "04X")
def load_shape_catalog(path: Path | str | None) -> ShapeCatalog:
    """Load a shape catalog CSV into a mapping keyed by numeric shape code.

    Args:
        path: Location of the catalog CSV, or ``None``.

    Returns:
        One entry per row whose ``shape_code`` column parses as an integer.
        Each entry holds the re-formatted ``shape_code``, the stripped
        ``human_readable_id`` and the ``description`` as read. Rows with an
        unparseable code are skipped, and a later row replaces an earlier one
        with the same code. An empty mapping is returned when ``path`` is
        ``None`` or does not exist.
    """
    if path is None:
        return {}
    shape_path = Path(path)
    if not shape_path.exists():
        return {}

    catalog: ShapeCatalog = {}
    # "utf-8-sig" reads plain UTF-8 unchanged and additionally drops a leading
    # byte order mark. Spreadsheet tools commonly write one; read as plain
    # "utf-8" it becomes part of the first header ("\ufeffshape_code"), every
    # shape_code lookup misses, and the catalog silently loads empty.
    with shape_path.open("r", encoding="utf-8-sig", newline="") as handle:
        for row in csv.DictReader(handle):
            shape_code = try_parse_int((row.get("shape_code") or "").strip())
            if shape_code is None:
                continue
            catalog[shape_code] = {
                "shape_code": format_shape_code(shape_code),
                "human_readable_id": (row.get("human_readable_id") or "").strip(),
                "description": row.get("description") or "",
            }
    return catalog
def shape_catalog_identifier(shape_code: int, shape_catalog: ShapeCatalog | None = None) -> str | None:
    """Return the sanitized catalog identifier for a shape code, if any.

    ``None`` is returned when there is no catalog, the code has no row, or the
    row's ``human_readable_id`` is empty after stripping whitespace.
    """
    entry = shape_catalog.get(shape_code) if shape_catalog else None
    if entry is None:
        return None
    label = (entry.get("human_readable_id") or "").strip()
    return sanitize_identifier(label) if label else None
def format_shape_reference(
    shape_code: int,
    shape_catalog: ShapeCatalog | None = None,
    frame_expr: str | None = None,
) -> str:
    """Render a shape reference, optionally subscripted with a frame expression.

    The catalog identifier is used when one is available and the formatted
    hex shape code otherwise. A frame expression that is ``None`` or blank
    adds no subscript.
    """
    name = shape_catalog_identifier(shape_code, shape_catalog)
    if not name:
        name = format_shape_code(shape_code)
    if frame_expr is not None and frame_expr.strip():
        return f"{name}[{frame_expr}]"
    return name
def iter_shape_code_matches(text: str):
    """Yield every shape code matched in ``text`` by SHAPE_REFERENCE_PATTERNS.

    Codes are yielded pattern by pattern, in match order, with duplicates
    kept. Matches whose literal does not parse are skipped.
    """
    for pattern in SHAPE_REFERENCE_PATTERNS:
        literals = (found.group("value") for found in pattern.finditer(text))
        for code in map(try_parse_int, literals):
            if code is not None:
                yield code
def collect_shape_codes_from_pseudocode(text: str) -> set[int]:
    """Return the distinct shape codes referenced in rendered pseudocode."""
    found: set[int] = set()
    found.update(iter_shape_code_matches(text))
    return found
def apply_shape_catalog_to_pseudocode(text: str, shape_catalog: ShapeCatalog | None = None) -> str:
    """Replace numeric shape literals in pseudocode with catalog identifiers.

    Each pattern in SHAPE_REFERENCE_PATTERNS is applied in turn. A literal is
    replaced only when it parses and the catalog has an identifier for it;
    otherwise the matched text is left as it was. With no catalog (``None``
    or empty) ``text`` is returned unchanged.
    """
    if not shape_catalog:
        return text

    def substitute(found: re.Match[str]) -> str:
        code = try_parse_int(found.group("value"))
        name = None if code is None else shape_catalog_identifier(code, shape_catalog)
        if name is None:
            return found.group(0)
        return f"{found.group('prefix')}{name}"

    for pattern in SHAPE_REFERENCE_PATTERNS:
        text = pattern.sub(substitute, text)
    return text
def generic_loop_selector_call(name: str, arguments: list[tuple[str, str]]) -> str:
    """Render ``name(label=expr, ...)`` from ordered (label, expression) pairs."""
    pieces = ["{}={}".format(label, expr) for label, expr in arguments]
    return "{}({})".format(name, ", ".join(pieces))
def normalize_game_variant(value: str | None) -> str | None:
if value is None:
return None
@ -998,6 +1151,24 @@ def load_tsv_rows(path: Path) -> list[dict[str, str]]:
return list(csv.DictReader(handle, delimiter="\t"))
@lru_cache(maxsize=None)
def load_class_name_hints_by_id(extracted_root_key: str) -> dict[int, str]:
    """Read class id -> class name hints from the class layout index.

    Results are memoized per ``extracted_root_key``. Rows whose ``class_id``
    does not parse or whose ``class_name_hint`` is blank are ignored; for
    duplicate ids the last row wins.
    """
    _, class_layout_index, _, _ = extracted_root_paths(Path(extracted_root_key))
    parsed = (
        (try_parse_int(row.get("class_id", "")), (row.get("class_name_hint") or "").strip())
        for row in load_tsv_rows(class_layout_index)
    )
    return {class_id: name for class_id, name in parsed if class_id is not None and name}
def class_name_hints_by_id(extracted_root: Path | str | None = None) -> dict[int, str]:
    """Return the memoized class id -> class name hints for an extracted root."""
    cache_key = str(resolve_extracted_root(extracted_root))
    return load_class_name_hints_by_id(cache_key)
def find_chunk_file(entry_index: int, extracted_root: Path | str | None = None) -> Path:
_, _, _, chunks_dir = extracted_root_paths(extracted_root)
matches = sorted(chunks_dir.glob(f"chunk_{entry_index:03d}_*.bin"))
@ -1174,6 +1345,7 @@ def parse_body_ir(
resolved_game_variant = resolve_game_variant(game_variant, chunk_file)
intrinsic_hints = get_intrinsic_hints(resolved_game_variant, chunk_file)
intrinsic_callsite_hints = get_intrinsic_callsite_hints(resolved_game_variant, chunk_file)
target_class_name_hints = class_name_hints_by_id(resolved_extracted_root)
body_start = parse_int(event_row["derived_body_start"])
body_end = parse_int(event_row["derived_body_end"])
@ -1190,6 +1362,10 @@ def parse_body_ir(
while offset < len(body):
result = parse_one_op(body, offset, intrinsic_hints, intrinsic_callsite_hints)
if result.op is not None:
operands = result.op["operands"]
if "target_class_id" in operands:
class_id = operands["target_class_id"]
operands["target_class_name_hint"] = target_class_name_hints.get(class_id)
result.op["absolute_body_offset"] = body_start + result.op["offset"]
ops.append(result.op)
if result.end_reason is not None:
@ -1598,9 +1774,7 @@ def format_script_statement(op: dict[str, Any], label_map: dict[int, str], body_
target = label_map.get(body_start + operands["target_offset"], f"0x{body_start + operands['target_offset']:04X}")
return f"call {target}"
if mnemonic == "call_class_event":
event_hint = operands.get("target_event_name_hint")
suffix = f" {event_hint}" if event_hint else ""
return f"call class 0x{operands['target_class_id']:04X}.slot 0x{operands['target_event_slot']:02X}{suffix}"
return f"call {format_target_event_reference(operands)}"
if mnemonic in {"append_unique_inline", "append_unique_indirect", "remove_matching_indirect", "remove_matching_inline"}:
return f"{mnemonic} size=0x{operands['element_size']:X}"
if mnemonic == "create_list":
@ -1617,17 +1791,13 @@ def format_script_statement(op: dict[str, Any], label_map: dict[int, str], body_
target = label_map.get(body_start + operands["target_offset"], f"0x{body_start + operands['target_offset']:04X}")
return f"{mnemonic} {operands['target_var']} elem_size=0x{operands['element_size']:X} -> {target}"
if mnemonic == "spawn":
event_hint = operands.get("target_event_name_hint")
suffix = f" {event_hint}" if event_hint else ""
return (
f"spawn class 0x{operands['target_class_id']:04X}.slot 0x{operands['target_event_slot']:02X}{suffix} "
f"spawn {format_target_event_reference(operands)} "
f"args=0x{operands['arg_bytes']:02X} this_size=0x{operands['this_size']:02X}"
)
if mnemonic == "spawn_inline":
event_hint = operands.get("target_event_name_hint")
suffix = f" {event_hint}" if event_hint else ""
return (
f"spawn_inline class 0x{operands['target_class_id']:04X}.slot 0x{operands['target_event_slot']:02X}{suffix} "
f"spawn_inline {format_target_event_reference(operands)} "
f"inline=0x{operands['inline_offset']:04X} this_size=0x{operands['this_size']:02X} unk=0x{operands['unknown']:02X}"
)
if mnemonic == "line_number":
@ -1701,6 +1871,20 @@ def sanitize_identifier(name: str) -> str:
return identifier
def target_event_display_name(operands: dict[str, Any]) -> str:
    """Pick the display name for the event targeted by a call or spawn op.

    Preference order: the CLASS_EVENT_NAME_HINTS entry for the (class id,
    slot) pair, then the op's own ``target_event_name_hint``, then
    ``slot_NN``. Empty values are treated as absent.
    """
    class_id = operands["target_class_id"]
    slot = operands["target_event_slot"]
    alias = CLASS_EVENT_NAME_HINTS.get((class_id, slot))
    if alias:
        return alias
    hint = operands.get("target_event_name_hint")
    if hint:
        return hint
    return f"slot_{slot:02X}"
def format_target_event_reference(operands: dict[str, Any]) -> str:
    """Render the target of a class-event call or spawn.

    With a class name hint the result is ``<Class>.<event>``; without one it
    is ``class_<ID>_<event>``, the class id shown as four uppercase hex
    digits. Both names are passed through ``sanitize_identifier``.
    """
    owner = operands.get("target_class_name_hint")
    event = sanitize_identifier(target_event_display_name(operands))
    if not owner:
        return "class_{:04X}_{}".format(operands["target_class_id"], event)
    return "{}.{}".format(sanitize_identifier(owner), event)
def build_local_name_map(ir: dict[str, Any]) -> dict[int, str]:
return {
symbol["bp_offset"]: sanitize_identifier(symbol["name"])
@ -1782,6 +1966,105 @@ def combine_binary(stack: list[tuple[str, int]], operator: str, result_width: in
stack.append((f"({left_expr} {operator} {right_expr})", result_width))
def evaluate_loop_setup_op(
    op: dict[str, Any],
    stack: list[tuple[str, int]],
    local_name_map: dict[int, str],
) -> bool:
    """Apply one loop-setup op to a symbolic expression stack.

    Args:
        op: Decoded op with ``mnemonic`` and ``operands`` keys.
        stack: Expression stack of ``(expression text, width)`` pairs; it is
            modified in place.
        local_name_map: Passed through to ``push_expr_from_op``.

    Returns:
        ``True`` when the op was consumed (pushes, ``push_indirect``, the
        arithmetic ops listed below, and ``line_number``), ``False`` for any
        other mnemonic.
    """
    # Ops that push_expr_from_op can render go straight onto the stack.
    pushed = push_expr_from_op(op, local_name_map)
    if pushed is not None:
        stack.append(pushed)
        return True
    mnemonic = op["mnemonic"]
    operands = op["operands"]
    if mnemonic == "push_indirect":
        # Replace the top expression with a dereference of it.
        # NOTE(review): an empty stack is left untouched yet the op still
        # counts as handled — confirm that is intended.
        if stack:
            expr, _ = stack.pop()
            stack.append((f"*({expr})", max(1, operands["size"])))
        return True
    # Arithmetic: the *_dword forms pass a width of 4 to combine_binary, the
    # plain forms a width of 2.
    if mnemonic in {"add", "add_dword"}:
        combine_binary(stack, "+", 4 if mnemonic.endswith("dword") else 2)
        return True
    if mnemonic in {"sub", "sub_dword"}:
        combine_binary(stack, "-", 4 if mnemonic.endswith("dword") else 2)
        return True
    if mnemonic in {"mul", "mul_dword"}:
        combine_binary(stack, "*", 4 if mnemonic.endswith("dword") else 2)
        return True
    if mnemonic in {"div", "div_dword"}:
        combine_binary(stack, "/", 4 if mnemonic.endswith("dword") else 2)
        return True
    # line_number ops do not touch the stack but are accepted.
    if mnemonic == "line_number":
        return True
    return False
def normalize_loop_origin(expr: str) -> str:
    """Strip one enclosing ``*( ... )`` dereference from a loop origin expression.

    Surrounding whitespace is always removed. The wrapper is removed only when
    the parenthesis opened by the leading ``*(`` is the one closed by the last
    character, so ``*(a)`` becomes ``a`` while ``*(a) + *(b)`` is returned
    unchanged instead of being cut into ``a) + *(b``.
    """
    normalized = expr.strip()
    if not (normalized.startswith("*(") and normalized.endswith(")")):
        return normalized

    last = len(normalized) - 1
    depth = 0
    # Start at the opening parenthesis (index 1) and find where it closes.
    for position, char in enumerate(normalized[1:], start=1):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth == 0:
                # Unwrap only if that closing parenthesis ends the expression.
                return normalized[2:-1] if position == last else normalized
    # Unbalanced parentheses: leave the expression alone.
    return normalized
def try_decode_loop_selector(
    ops: list[dict[str, Any]],
    start_index: int,
    local_name_map: dict[int, str],
) -> tuple[str, int] | None:
    """Try to decode a loop selector prologue that ends in a ``loop`` op.

    Starting at ``start_index`` the ops are scanned up to the next ``loop``
    op. ``loopscr`` ops contribute their ``value_u8`` to a token list and
    every other op is applied to a symbolic stack by
    ``evaluate_loop_setup_op``.

    Two token shapes are recognised, both requiring the ``loop`` op to have
    ``string_bytes == 0x6`` and ``loop_type == 0x2``:

    * ``0x24, 0x3D, <field>, 0x25`` with ``<field>`` present in
      LOOP_SELECTOR_FIELD_HINTS and at least three stack entries, rendered as
      ``<var> in nearby_items(<field>=<value>, origin=<origin>)``.
    * ``0x24, 0x42`` with at least four stack entries, rendered as
      ``<var> in selector_0x42(arg0=..., arg1=..., arg2=..., origin=...)``.

    Returns:
        ``(selector_text, index_after_loop_op)`` on success, otherwise
        ``None``.
    """
    selector_tokens: list[int] = []
    selector_stack: list[tuple[str, int]] = []
    index = start_index
    while index < len(ops):
        op = ops[index]
        mnemonic = op["mnemonic"]
        if mnemonic == "loop":
            break
        if mnemonic == "loopscr":
            selector_tokens.append(op["operands"]["value_u8"])
        elif not evaluate_loop_setup_op(op, selector_stack, local_name_map):
            # An op that cannot be modelled symbolically aborts the decode.
            return None
        index += 1

    if index >= len(ops) or ops[index]["mnemonic"] != "loop":
        return None
    loop_operands = ops[index]["operands"]
    if loop_operands.get("string_bytes") != 0x6 or loop_operands.get("loop_type") != 0x2:
        return None

    is_field_selector = (
        len(selector_tokens) == 4
        and selector_tokens[0] == 0x24
        and selector_tokens[1] == 0x3D
        and selector_tokens[3] == 0x25
    )
    if not is_field_selector:
        if selector_tokens == [0x24, 0x42] and len(selector_stack) >= 4:
            current_var = format_bp_name(loop_operands["current_var"], local_name_map)
            # Build the call text first: a replacement field that spans
            # several lines inside a single-quoted f-string only parses on
            # Python 3.12+ (PEP 701).
            selector_call = generic_loop_selector_call(
                "selector_0x42",
                [
                    ("arg0", selector_stack[-4][0]),
                    ("arg1", selector_stack[-3][0]),
                    ("arg2", selector_stack[-2][0]),
                    ("origin", normalize_loop_origin(selector_stack[-1][0])),
                ],
            )
            return f"{current_var} in {selector_call}", index + 1
        return None

    selector_field = LOOP_SELECTOR_FIELD_HINTS.get(selector_tokens[2])
    if selector_field is None or len(selector_stack) < 3:
        return None
    current_var = format_bp_name(loop_operands["current_var"], local_name_map)
    selector_value = selector_stack[-3][0]
    origin_expr = normalize_loop_origin(selector_stack[-1][0])
    return (
        f"{current_var} in nearby_items({selector_field}={selector_value}, origin={origin_expr})",
        index + 1,
    )
def loop_selector_statement(selector_text: str) -> str:
    """Wrap selector text in the ``/* loop_selector ... */`` marker comment."""
    return "/* loop_selector {} */".format(selector_text)
def decompile_pseudocode_blocks(ir: dict[str, Any]) -> list[tuple[str, list[str]]]:
label_map, blocks = build_script_blocks(ir)
local_name_map = build_local_name_map(ir)
@ -1799,6 +2082,16 @@ def decompile_pseudocode_blocks(ir: dict[str, Any]) -> list[tuple[str, list[str]
mnemonic = op["mnemonic"]
operands = op["operands"]
if mnemonic == "loopscr":
decoded_loop = try_decode_loop_selector(ops, index, local_name_map)
if decoded_loop is not None:
selector_text, next_index = decoded_loop
block_lines.append(loop_selector_statement(selector_text))
stack.clear()
pending_result = None
index = next_index
continue
pushed = push_expr_from_op(op, local_name_map)
if pushed is not None:
stack.append(pushed)
@ -1855,10 +2148,7 @@ def decompile_pseudocode_blocks(ir: dict[str, Any]) -> list[tuple[str, list[str]
if mnemonic == "call_class_event":
arg_text = ", ".join(expr for expr, _ in stack)
stack.clear()
event_name = operands.get("target_event_name_hint") or f"slot_{operands['target_event_slot']:02X}"
block_lines.append(
f"class_{operands['target_class_id']:04X}_{sanitize_identifier(event_name)}({arg_text});"
)
block_lines.append(f"{format_target_event_reference(operands)}({arg_text});")
pending_result = None
index += 1
continue
@ -1866,10 +2156,7 @@ def decompile_pseudocode_blocks(ir: dict[str, Any]) -> list[tuple[str, list[str]
if mnemonic == "spawn":
arg_text = ", ".join(expr for expr, _ in stack)
stack.clear()
event_name = operands.get("target_event_name_hint") or f"slot_{operands['target_event_slot']:02X}"
block_lines.append(
f"spawn class_{operands['target_class_id']:04X}_{sanitize_identifier(event_name)}({arg_text});"
)
block_lines.append(f"spawn {format_target_event_reference(operands)}({arg_text});")
pending_result = None
index += 1
continue
@ -1877,9 +2164,8 @@ def decompile_pseudocode_blocks(ir: dict[str, Any]) -> list[tuple[str, list[str]
if mnemonic == "spawn_inline":
arg_text = ", ".join(expr for expr, _ in stack)
stack.clear()
event_name = operands.get("target_event_name_hint") or f"slot_{operands['target_event_slot']:02X}"
block_lines.append(
f"spawn_inline class_{operands['target_class_id']:04X}_{sanitize_identifier(event_name)}({arg_text}) /* inline=0x{operands['inline_offset']:04X} */;"
f"spawn_inline {format_target_event_reference(operands)}({arg_text}) /* inline=0x{operands['inline_offset']:04X} */;"
)
pending_result = None
index += 1
@ -2154,12 +2440,25 @@ def parse_selector_condition(condition: str) -> tuple[str, str] | None:
return match.group(1).strip(), match.group(2).strip()
def parse_loop_selector_statement(statement: str) -> str | None:
    """Extract the selector text from a ``/* loop_selector ... */`` marker.

    Returns ``None`` when the statement is not exactly such a marker.
    """
    found = re.fullmatch(r"/\* loop_selector (.+) \*/", statement)
    return found.group(1) if found is not None else None
def is_loop_selector_only_block(statements: list[str]) -> bool:
    """Return True when a block consists of a single loop_selector marker."""
    if len(statements) != 1:
        return False
    return parse_loop_selector_statement(statements[0]) is not None
def render_selector_chain(
blocks: list[tuple[str, list[str]]],
label_to_index: dict[str, int],
start_index: int,
end_index: int,
return_labels: set[str],
active_regions: set[tuple[int, int, tuple[str, ...]]] | None = None,
render_cache: dict[tuple[int, int, tuple[str, ...]], tuple[list[str], bool] | None] | None = None,
) -> tuple[list[str], int] | None:
if not blocks[start_index][1]:
return None
@ -2220,6 +2519,8 @@ def render_selector_chain(
target_index,
return_labels,
{join_label},
active_regions,
render_cache,
)
if body_result is None:
return None
@ -2250,7 +2551,19 @@ def render_structured_region(
end_index: int,
return_labels: set[str],
exit_labels: set[str] | None = None,
active_regions: set[tuple[int, int, tuple[str, ...]]] | None = None,
render_cache: dict[tuple[int, int, tuple[str, ...]], tuple[list[str], bool] | None] | None = None,
) -> tuple[list[str], bool] | None:
region_key = (start_index, end_index, tuple(sorted(exit_labels or ())))
if render_cache is not None and region_key in render_cache:
return render_cache[region_key]
if active_regions is None:
active_regions = set()
elif region_key in active_regions:
return None
active_regions = set(active_regions)
active_regions.add(region_key)
allowed_exit_labels = set(exit_labels or ())
lines: list[str] = []
index = start_index
@ -2265,6 +2578,9 @@ def render_structured_region(
if not statements:
index += 1
continue
if is_loop_selector_only_block(statements):
index += 1
continue
terminal = parse_terminal_statement(statements[-1])
if terminal is None:
@ -2304,7 +2620,15 @@ def render_structured_region(
index += 1
continue
selector_chain = render_selector_chain(blocks, label_to_index, index, end_index, return_labels)
selector_chain = render_selector_chain(
blocks,
label_to_index,
index,
end_index,
return_labels,
active_regions,
render_cache,
)
if selector_chain is not None:
selector_lines, selector_join_index = selector_chain
lines.extend(selector_lines)
@ -2323,10 +2647,18 @@ def render_structured_region(
target_index,
return_labels,
{blocks[index][0]},
active_regions,
render_cache,
)
if loop_body is not None:
loop_lines, _ = loop_body
lines.append(f"while ({invert_condition_text(terminal.condition or 'condition')}) {{")
loop_selector = None
if index > start_index:
loop_selector = parse_loop_selector_statement(blocks[index - 1][1][0]) if is_loop_selector_only_block(blocks[index - 1][1]) else None
if loop_selector is not None:
lines.append(f"for {loop_selector} {{")
else:
lines.append(f"while ({invert_condition_text(terminal.condition or 'condition')}) {{")
lines.extend(indent_lines(loop_lines))
lines.append("}")
index = target_index
@ -2346,6 +2678,8 @@ def render_structured_region(
target_index,
return_labels,
{join_label},
active_regions,
render_cache,
)
false_result = render_structured_region(
blocks,
@ -2354,6 +2688,8 @@ def render_structured_region(
join_index,
return_labels,
{join_label},
active_regions,
render_cache,
)
if true_result is not None and false_result is not None:
true_lines, _ = true_result
@ -2372,8 +2708,19 @@ def render_structured_region(
index = join_index
continue
inner_result = render_structured_region(blocks, label_to_index, index + 1, target_index, return_labels)
inner_result = render_structured_region(
blocks,
label_to_index,
index + 1,
target_index,
return_labels,
None,
active_regions,
render_cache,
)
if inner_result is None:
if render_cache is not None:
render_cache[region_key] = None
return None
inner_lines, inner_falls_through = inner_result
@ -2387,7 +2734,10 @@ def render_structured_region(
index = target_index
return lines, True
result = (lines, True)
if render_cache is not None:
render_cache[region_key] = result
return result
def render_structured_pseudocode(blocks: list[tuple[str, list[str]]]) -> list[str] | None:
@ -2400,7 +2750,8 @@ def render_structured_pseudocode(blocks: list[tuple[str, list[str]]]) -> list[st
for label, statements in blocks
if len(statements) == 1 and statements[0] == "return;"
}
structured = render_structured_region(blocks, label_to_index, 0, len(blocks), return_labels)
render_cache: dict[tuple[int, int, tuple[str, ...]], tuple[list[str], bool] | None] = {}
structured = render_structured_region(blocks, label_to_index, 0, len(blocks), return_labels, None, None, render_cache)
if structured is None:
return None
return structured[0]
@ -2421,6 +2772,41 @@ def render_partially_structured_blocks(blocks: list[tuple[str, list[str]]]) -> l
index = 0
while index < len(blocks):
label, statements = blocks[index]
if is_loop_selector_only_block(statements):
loop_selector = parse_loop_selector_statement(statements[0])
if loop_selector is not None and index + 1 < len(blocks):
next_label, next_statements = blocks[index + 1]
next_terminal = parse_terminal_statement(next_statements[-1]) if next_statements else None
if next_terminal is not None and next_terminal.kind == "if":
target_index = label_to_index.get(next_terminal.target or "")
if target_index is not None and target_index > index + 1:
loop_tail_index = last_nonempty_block_index(blocks, index + 2, target_index)
if loop_tail_index is not None:
loop_tail_terminal = parse_terminal_statement(blocks[loop_tail_index][1][-1])
if loop_tail_terminal is not None and loop_tail_terminal.kind == "goto" and loop_tail_terminal.target == next_label:
loop_body = render_structured_region(
blocks,
label_to_index,
index + 2,
target_index,
return_labels,
{next_label},
)
if loop_body is not None:
loop_lines, _ = loop_body
lines.append(f" {label}:")
lines.append(f" for {loop_selector} {{")
lines.extend(f" {line}" for line in indent_lines(loop_lines))
lines.append(" }")
lines.append("")
index = target_index
continue
lines.append(f" {label}:")
lines.append(f" {statements[0]}")
lines.append("")
index += 1
continue
selector_chain = render_selector_chain(blocks, label_to_index, index, len(blocks), return_labels)
if selector_chain is not None:
selector_lines, selector_join_index = selector_chain
@ -2440,7 +2826,7 @@ def render_partially_structured_blocks(blocks: list[tuple[str, list[str]]]) -> l
return lines
def render_pseudocode(ir: dict[str, Any]) -> str:
def render_pseudocode(ir: dict[str, Any], shape_catalog: ShapeCatalog | None = None) -> str:
slot_name = sanitize_identifier(ir["event"]["event_name_hint"] or f"slot_{ir['event']['slot']:02X}")
lines = [
(
@ -2466,7 +2852,7 @@ def render_pseudocode(ir: dict[str, Any]) -> str:
lines.extend(render_partially_structured_blocks(rendered_blocks))
lines.append("}")
return "\n".join(lines) + "\n"
return apply_shape_catalog_to_pseudocode("\n".join(lines) + "\n", shape_catalog)
def render_text(ir: dict[str, Any]) -> str:
@ -2525,6 +2911,14 @@ def main() -> None:
parser.add_argument("--slot", required=True, help="Event slot, for example 0x0A")
parser.add_argument("--extracted-root", default=str(EXTRACTED_ROOT), help="Extracted USECODE root containing class_event_index.tsv and chunks/")
parser.add_argument("--variant", choices=["auto", "regret", "remorse"], default="auto", help="Crusader intrinsic numbering to apply (default: auto, fallback regret)")
parser.add_argument(
"--shape-csv",
help=(
"Shape catalog CSV to apply to pseudocode output "
"(default: Remorse uses <extracted-root>/usecode_shape_catalog_remorse.csv; "
"Regret uses <extracted-root>/usecode_shape_catalog_regret.csv)"
),
)
parser.add_argument("--output", help="Write IR JSON to this file instead of stdout")
parser.add_argument("--emit-text", action="store_true", help="Emit a readable text listing beside the JSON")
parser.add_argument("--text-output", help="Write the text listing to this file")
@ -2539,6 +2933,8 @@ def main() -> None:
slot = parse_int(args.slot)
extracted_root = Path(args.extracted_root)
shape_csv = Path(args.shape_csv) if args.shape_csv else default_shape_catalog_path(extracted_root, args.variant)
shape_catalog = load_shape_catalog(shape_csv)
event_row, layout_row = select_rows(args.class_name, slot, extracted_root)
ir = parse_body_ir(event_row, layout_row, None if args.variant == "auto" else args.variant, extracted_root)
@ -2563,7 +2959,7 @@ def main() -> None:
print(rendered_script)
if args.emit_pseudocode:
rendered_pseudocode = render_pseudocode(ir)
rendered_pseudocode = render_pseudocode(ir, shape_catalog=shape_catalog)
if args.pseudocode_output:
Path(args.pseudocode_output).write_text(rendered_pseudocode, encoding="utf-8")
else:

View file

@ -3,10 +3,12 @@ from __future__ import annotations
import unittest
from tools.poc_crusader_usecode_parser import (
format_target_event_reference,
get_intrinsic_hints,
intrinsic_display_name,
render_partially_structured_blocks,
render_structured_pseudocode,
try_decode_loop_selector,
)
@ -91,6 +93,78 @@ class UsecodeStructuringTests(unittest.TestCase):
self.assertIn("while (!condition) {", text)
self.assertNotIn("goto block_0118;", text)
def test_loop_selector_block_renders_as_for_loop(self) -> None:
    # A block holding only a loop_selector marker, directly followed by the
    # loop's condition block, must be rendered with a `for` header in place
    # of the generic `while (!condition)` header.
    blocks = [
        ("entry", ["/* loop_selector item in nearby_items(shape=0x04D0, origin=arg_06) */"]),
        ("block_0118", ["if condition goto block_0151;"]),
        ("block_011B", ["if (Item.getFrame(item) != 0) goto block_014D;", "suspend;"]),
        ("block_014D", ["goto block_0118;"]),
        ("block_0151", ["return;"]),
    ]
    rendered = render_structured_pseudocode(blocks)
    self.assertIsNotNone(rendered)
    text = "\n".join(rendered or [])
    self.assertIn("for item in nearby_items(shape=0x04D0, origin=arg_06) {", text)
    self.assertNotIn("while (!condition) {", text)
def test_loop_selector_renders_in_partial_fallback(self) -> None:
    # Same loop shape as the structured case, but rendered through the
    # partially structured renderer: the selector block keeps its label and
    # still becomes a `for` header.
    blocks = [
        ("entry", ["/* loop_selector item in nearby_items(shape=0x04D0, origin=arg_06) */"]),
        ("block_0118", ["if condition goto block_0151;"]),
        ("block_011B", ["if other goto block_014D;", "suspend;"]),
        ("block_014D", ["goto block_0118;"]),
        ("block_0151", ["goto block_0200;"]),
        ("block_0200", ["return;"]),
    ]
    rendered = render_partially_structured_blocks(blocks)
    text = "\n".join(rendered)
    self.assertIn("entry:", text)
    self.assertIn("for item in nearby_items(shape=0x04D0, origin=arg_06) {", text)
def test_target_event_reference_prefers_alias_and_class_name(self) -> None:
    # The op carries no event name hint, so the event name has to come from
    # the (class id, slot) alias; the class name hint supplies the prefix.
    target = format_target_event_reference(
        {
            "target_class_id": 0x0A0C,
            "target_class_name_hint": "FREE",
            "target_event_slot": 0x32,
            "target_event_name_hint": None,
        }
    )
    self.assertEqual(target, "FREE.waitNTimerTicks")
def test_selector_0x42_decodes_to_readable_fallback(self) -> None:
    # Token sequence 0x24, 0x42 with four values on the setup stack should
    # decode to the generic selector_0x42(...) form. The second element of
    # the result, 9, is the index just past the `loop` op at position 8.
    decoded = try_decode_loop_selector(
        [
            {"mnemonic": "loopscr", "operands": {"value_u8": 0x24}},
            {"mnemonic": "push_word_immediate", "operands": {"value_u16": 0x04C8}},
            {"mnemonic": "push_word_immediate", "operands": {"value_u16": 0x01CD}},
            {"mnemonic": "loopscr", "operands": {"value_u8": 0x42}},
            {"mnemonic": "push_byte_immediate", "operands": {"value_u8": 0x32, "value_signed": 50}},
            {"mnemonic": "push_byte_immediate", "operands": {"value_u8": 0x20, "value_signed": 32}},
            {"mnemonic": "mul", "operands": {}},
            {"mnemonic": "push_local_word", "operands": {"bp_offset": 0x0A}},
            {
                "mnemonic": "loop",
                "operands": {"current_var": 0xFE, "string_bytes": 0x6, "loop_type": 0x2},
            },
        ],
        0,
        {0xFE: "n", 0x0A: "eventTrigger"},
    )
    self.assertEqual(
        decoded,
        (
            "n in selector_0x42(arg0=0x04C8, arg1=0x01CD, arg2=(50 * 32), origin=eventTrigger)",
            9,
        ),
    )
def test_selector_ladder_renders_as_else_if_chain(self) -> None:
blocks = [
("entry", ["if (dir != 0) goto block_0358;"]),

View file

@ -0,0 +1,228 @@
from __future__ import annotations
import argparse
import csv
import re
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from tools.poc_crusader_usecode_parser import (
EXTRACTED_ROOT,
NUMERIC_SHAPE_LITERAL_PATTERN,
SHAPE_CATALOG_FIELDNAMES,
collect_shape_codes_from_pseudocode,
default_shape_catalog_path,
format_shape_code,
load_tsv_rows,
parse_body_ir,
parse_int,
render_pseudocode,
sanitize_identifier,
try_parse_int,
)
# Matches Item.getShape(arg_06) / Item.getType(arg_06) compared against a
# numeric literal; the literal is captured as ``value``.
# NOTE(review): presumably arg_06 is the object the event runs on, hence
# "SELF" — confirm against the decompiler's argument naming.
SELF_SHAPE_COMPARISON_PATTERN = re.compile(
    rf"\bItem\.(?:getShape|getType)\(\s*arg_06\s*\)\s*(?:==|!=|<=|>=|<|>)\s*(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"
)
def describe_row(row: dict[str, str]) -> str:
    """Return a one-line, human-readable label for a class/event index row.

    The label has the form ``entry <index> <class>::<event> (slot 0xNN)``.
    A missing or empty class name hint is shown as ``unknown``, a missing or
    empty event name hint as ``slot_NN``, and a missing entry index as ``?``.
    A missing ``slot`` column is treated as slot 0.
    """
    class_name = row.get("class_name_hint") or "unknown"
    slot = parse_int(row.get("slot", "0"))
    event_name = row.get("event_name_hint") or f"slot_{slot:02X}"
    entry_index = row.get("entry_index", "?")
    # Interpolate the names directly so that no text inside a class or event
    # name is ever rewritten by a post-processing replace.
    return f"entry {entry_index} {class_name}::{event_name} (slot 0x{slot:02X})"
def load_layout_by_entry(class_layout_index: Path) -> dict[int, dict[str, str]]:
    """Index the class layout TSV rows by their parsed ``entry_index``.

    Rows whose ``entry_index`` does not parse are dropped; when several rows
    share an index the last one is kept.
    """
    keyed_rows = (
        (try_parse_int(row.get("entry_index", "")), row)
        for row in load_tsv_rows(class_layout_index)
    )
    return {entry_index: row for entry_index, row in keyed_rows if entry_index is not None}
def load_existing_catalog(csv_path: Path) -> tuple[list[str], list[dict[str, str]], set[int]]:
    """Load the current shape catalog CSV so new codes can be appended to it.

    Args:
        csv_path: Location of the catalog CSV. The file may not exist yet.

    Returns:
        A ``(fieldnames, rows, existing_codes)`` tuple:

        * ``fieldnames`` - the file's non-empty header names, extended with any
          ``SHAPE_CATALOG_FIELDNAMES`` entries the file lacks (a copy of
          ``SHAPE_CATALOG_FIELDNAMES`` when the file is missing or has no
          usable header).
        * ``rows`` - one dict per kept row, restricted to ``fieldnames`` with
          every value a string. Rows with a parseable ``shape_code`` have it
          rewritten through ``format_shape_code``; when a code repeats only the
          first row is kept. Rows whose ``shape_code`` does not parse are kept
          as they are.
        * ``existing_codes`` - the set of parsed shape codes found.
    """
    if not csv_path.exists():
        return SHAPE_CATALOG_FIELDNAMES[:], [], set()

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM. Without
    # it a BOM written by a spreadsheet editor sticks to the first header name,
    # the real "shape_code" column is not recognised, and every code is
    # re-appended as if it were new.
    with csv_path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        fieldnames = [name for name in (reader.fieldnames or []) if name] or SHAPE_CATALOG_FIELDNAMES[:]
        for required_name in SHAPE_CATALOG_FIELDNAMES:
            if required_name not in fieldnames:
                fieldnames.append(required_name)

        rows: list[dict[str, str]] = []
        existing_codes: set[int] = set()
        for raw_row in reader:
            # DictReader pads short rows with None (its default restval), which
            # a .get() default does not replace; normalise those cells to "".
            row = {fieldname: raw_row.get(fieldname) or "" for fieldname in fieldnames}
            shape_code = try_parse_int(row.get("shape_code", "").strip())
            if shape_code is not None:
                if shape_code in existing_codes:
                    # Keep only the first row for a repeated shape code.
                    continue
                row["shape_code"] = format_shape_code(shape_code)
                existing_codes.add(shape_code)
            rows.append(row)
    return fieldnames, rows, existing_codes
def write_catalog(csv_path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write the catalog to ``csv_path`` as UTF-8 CSV, header first.

    Missing parent directories are created. An existing file is overwritten.
    """
    target_dir = csv_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with csv_path.open("w", encoding="utf-8", newline="") as out_file:
        catalog_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        catalog_writer.writeheader()
        for catalog_row in rows:
            catalog_writer.writerow(catalog_row)
def scan_exported_pseudocode_shape_codes(extracted_root: Path) -> set[int]:
    """Collect shape codes from pseudocode files already exported under the root.

    Every ``*.txt`` file below ``<extracted_root>/pseudocode`` is read as UTF-8
    and passed to ``collect_shape_codes_from_pseudocode``. Progress is printed
    per file. When the directory does not exist a notice is printed and an
    empty set is returned.
    """
    pseudocode_root = extracted_root / "pseudocode"
    if not pseudocode_root.exists():
        print(f"No exported pseudocode directory at {pseudocode_root}; skipping pre-scan", flush=True)
        return set()

    exported_files = sorted(pseudocode_root.rglob("*.txt"))
    total = len(exported_files)
    print(f"Scanning {total} exported pseudocode files under {pseudocode_root}", flush=True)

    found: set[int] = set()
    ordinal = 0
    for exported_file in exported_files:
        ordinal += 1
        relative = exported_file.relative_to(extracted_root).as_posix()
        print(f"[pseudocode {ordinal}/{total}] {relative}", flush=True)
        contents = exported_file.read_text(encoding="utf-8")
        found.update(collect_shape_codes_from_pseudocode(contents))
    return found
def auto_shape_identifier(class_name: str, shape_code: int) -> str:
    """Derive a catalog identifier from a class name and a shape code.

    The lower-cased class name (``shape`` when the name is empty) is passed
    through ``sanitize_identifier``, suffixed with ``_shape_`` and the code as
    at least four lower-case hex digits, and the result is sanitized again.
    """
    lowered = (class_name or "shape").lower()
    prefix = sanitize_identifier(lowered)
    candidate = f"{prefix}_shape_{shape_code:04x}"
    return sanitize_identifier(candidate)
def collect_shape_suggestions_from_pseudocode(text: str, row: dict[str, str]) -> dict[int, dict[str, str]]:
    """Suggest catalog names for shape codes a body compares its own shape against.

    Each ``SELF_SHAPE_COMPARISON_PATTERN`` match in ``text`` whose literal
    parses as an integer yields a suggestion keyed by that code, holding a
    ``human_readable_id`` and a ``description`` derived from the row's
    ``class_name_hint``. Rows with no (or a blank) class hint yield nothing.
    """
    owner = (row.get("class_name_hint") or "").strip()
    if owner == "":
        return {}

    description = f"Auto-derived from {owner} self-shape comparison in USECODE"
    found: dict[int, dict[str, str]] = {}
    for hit in SELF_SHAPE_COMPARISON_PATTERN.finditer(text):
        code = try_parse_int(hit.group("value"))
        if code is not None:
            found[code] = {
                "human_readable_id": auto_shape_identifier(owner, code),
                "description": description,
            }
    return found
def scan_shape_codes(extracted_root: Path, variant: str) -> tuple[set[int], dict[int, dict[str, str]]]:
    """Decode every indexed USECODE body and gather shape codes and name suggestions.

    Rows of ``class_event_index.tsv`` that have both ``derived_body_start`` and
    ``derived_body_end`` are decoded and rendered to pseudocode. Rows without a
    matching ``class_layout_index.tsv`` entry are skipped with a message. Codes
    found in already-exported pseudocode files seed the result.

    Args:
        extracted_root: Directory holding the two index TSV files.
        variant: ``"auto"`` passes ``None`` to ``parse_body_ir``; any other
            value is passed through unchanged.

    Returns:
        ``(shape_codes, suggestions)``. A suggestion is returned for a code
        only when every body that mentions it proposes the same identifier.
    """
    event_rows = load_tsv_rows(extracted_root / "class_event_index.tsv")
    decodable = [
        candidate
        for candidate in event_rows
        if candidate.get("derived_body_start") and candidate.get("derived_body_end")
    ]
    layouts = load_layout_by_entry(extracted_root / "class_layout_index.tsv")
    found_codes = scan_exported_pseudocode_shape_codes(extracted_root)

    total = len(decodable)
    forced_variant = None if variant == "auto" else variant
    candidates_by_code: dict[int, dict[str, dict[str, str]]] = {}
    print(f"Scanning {total} decoded USECODE bodies from {extracted_root}", flush=True)

    for position, row in enumerate(decodable, start=1):
        progress = f"[{position}/{total}]"
        layout_row = layouts.get(parse_int(row["entry_index"]))
        if layout_row is None:
            print(f"{progress} Skipping {describe_row(row)} because no layout row was found", flush=True)
            continue

        label = describe_row(row)
        print(f"{progress} Decoding {label}", flush=True)
        ir = parse_body_ir(row, layout_row, forced_variant, extracted_root)
        print(f"{progress} Rendering {label}", flush=True)
        rendered = render_pseudocode(ir)

        found_codes.update(collect_shape_codes_from_pseudocode(rendered))
        suggestions = collect_shape_suggestions_from_pseudocode(rendered, row)
        for code, suggestion in suggestions.items():
            # Key by identifier so identical proposals from several bodies collapse.
            per_code = candidates_by_code.setdefault(code, {})
            per_code[suggestion["human_readable_id"]] = suggestion

    # Keep a suggestion only when exactly one identifier was proposed for the code.
    unambiguous = {
        code: next(iter(proposals.values()))
        for code, proposals in candidates_by_code.items()
        if len(proposals) == 1
    }
    return found_codes, unambiguous
def _backfill_row(row: dict[str, str], suggestion: dict[str, str]) -> bool:
    """Fill blank ``human_readable_id``/``description`` cells; report whether any changed."""
    changed = False
    for column in ("human_readable_id", "description"):
        if not (row.get(column) or "").strip():
            row[column] = suggestion[column]
            changed = True
    return changed


def main() -> None:
    """Command-line entry point: scan USECODE and update the shape catalog CSV.

    Steps:
        1. Parse ``--extracted-root``, ``--output-csv`` and ``--variant``.
        2. Scan the extracted root for shape codes and name suggestions.
        3. Load the existing catalog and fill blank ``human_readable_id`` /
           ``description`` cells of existing rows from the suggestions
           (non-blank cells are never overwritten).
        4. Append one row per scanned code that the catalog does not have yet,
           in ascending code order.
        5. Write the catalog back and print a summary.
    """
    parser = argparse.ArgumentParser(description="Generate or update the append-only USECODE shape catalog CSV")
    parser.add_argument(
        "--extracted-root",
        default=str(EXTRACTED_ROOT),
        help="Extracted USECODE root containing class_event_index.tsv and chunks/",
    )
    parser.add_argument(
        "--output-csv",
        help=(
            "Catalog CSV path "
            "(default: Remorse uses <extracted-root>/usecode_shape_catalog_remorse.csv; "
            "Regret uses <extracted-root>/usecode_shape_catalog_regret.csv)"
        ),
    )
    parser.add_argument(
        "--variant",
        choices=["auto", "regret", "remorse"],
        default="auto",
        help="Crusader intrinsic numbering to apply during scanning (default: auto, fallback regret)",
    )
    args = parser.parse_args()

    extracted_root = Path(args.extracted_root)
    output_csv = Path(args.output_csv) if args.output_csv else default_shape_catalog_path(extracted_root, args.variant)
    print(
        f"Updating shape catalog {output_csv} from extracted_root={extracted_root} using variant={args.variant}",
        flush=True,
    )

    shape_codes, suggested_rows = scan_shape_codes(extracted_root, args.variant)
    fieldnames, rows, existing_codes = load_existing_catalog(output_csv)

    # Count a row as backfilled when any of its cells was filled. Previously
    # the counter was tied to the human_readable_id assignment only, so rows
    # that only received a description were left out of the summary.
    backfilled = 0
    for row in rows:
        shape_code = try_parse_int((row.get("shape_code") or "").strip())
        if shape_code is None:
            continue
        suggestion = suggested_rows.get(shape_code)
        if suggestion is None:
            continue
        if _backfill_row(row, suggestion):
            backfilled += 1

    missing_codes = sorted(shape_codes - existing_codes)
    for shape_code in missing_codes:
        row = {fieldname: "" for fieldname in fieldnames}
        row["shape_code"] = format_shape_code(shape_code)
        suggestion = suggested_rows.get(shape_code)
        if suggestion is not None:
            row["human_readable_id"] = suggestion["human_readable_id"]
            row["description"] = suggestion["description"]
        rows.append(row)

    write_catalog(output_csv, fieldnames, rows)
    print(
        f"Catalog {output_csv} now contains {len(rows)} rows; "
        f"scanned {len(shape_codes)} distinct shape codes, added {len(missing_codes)} new rows, "
        f"and backfilled {backfilled} existing rows."
    )
# Run the catalog update only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()