"""Generate or update the append-only USECODE shape catalog CSV.

The script scans previously exported pseudocode files and freshly decoded
USECODE bodies for shape codes, then appends any codes missing from the
catalog CSV and backfills empty name/description cells of existing rows
with auto-derived suggestions.
"""

from __future__ import annotations

import argparse
import csv
import re
import sys
from pathlib import Path

# Make the repository root importable so the ``tools`` package resolves when
# this file is executed directly as a script.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from tools.poc_crusader_usecode_parser import (  # noqa: E402  (needs sys.path tweak above)
    EXTRACTED_ROOT,
    NUMERIC_SHAPE_LITERAL_PATTERN,
    SHAPE_CATALOG_FIELDNAMES,
    collect_shape_codes_from_pseudocode,
    default_shape_catalog_path,
    format_shape_code,
    load_tsv_rows,
    parse_body_ir,
    parse_int,
    render_pseudocode,
    sanitize_identifier,
    try_parse_int,
)

# Matches a comparison of the object's own shape/type (``arg_06``) against a
# numeric literal.  The literal is captured in the named group ``value``,
# which ``collect_shape_suggestions_from_pseudocode`` reads back.
SELF_SHAPE_COMPARISON_PATTERN = re.compile(
    rf"\bItem\.(?:getShape|getType)\(\s*arg_06\s*\)\s*(?:==|!=|<=|>=|<|>)\s*(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"
)


def describe_row(row: dict[str, str]) -> str:
    """Return a short human-readable label for a class/event index row.

    Missing class names fall back to ``"unknown"``, missing event names to
    ``slot_XX`` (hex slot number), and a missing entry index to ``"?"``.
    """
    class_name = row.get("class_name_hint") or "unknown"
    slot = parse_int(row.get("slot", "0"))
    event_name = row.get("event_name_hint") or f"slot_{slot:02X}"
    entry_index = row.get("entry_index", "?")
    # The "$" is inserted and immediately stripped again by the replace, so
    # the visible result is "Class::event".
    return f"entry {entry_index} {class_name}::${event_name}".replace("::$", "::") + f" (slot 0x{slot:02X})"


def load_layout_by_entry(class_layout_index: Path) -> dict[int, dict[str, str]]:
    """Load the class layout TSV and key its rows by integer ``entry_index``.

    Rows whose ``entry_index`` cannot be parsed are skipped; when an index
    occurs more than once, the last row wins.
    """
    rows = load_tsv_rows(class_layout_index)
    layout_by_entry: dict[int, dict[str, str]] = {}
    for row in rows:
        entry_index = try_parse_int(row.get("entry_index", ""))
        if entry_index is None:
            continue
        layout_by_entry[entry_index] = row
    return layout_by_entry


def load_existing_catalog(csv_path: Path) -> tuple[list[str], list[dict[str, str]], set[int]]:
    """Read an existing shape catalog CSV.

    Returns:
        A ``(fieldnames, rows, existing_codes)`` tuple.  ``fieldnames`` is the
        file's header with any missing required catalog columns appended,
        ``rows`` are the data rows restricted to those columns, and
        ``existing_codes`` is the set of shape codes that parsed successfully.
        When ``csv_path`` does not exist, the default field names and empty
        collections are returned.
    """
    if not csv_path.exists():
        return SHAPE_CATALOG_FIELDNAMES[:], [], set()

    with csv_path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        fieldnames = [name for name in (reader.fieldnames or []) if name] or SHAPE_CATALOG_FIELDNAMES[:]
        for required_name in SHAPE_CATALOG_FIELDNAMES:
            if required_name not in fieldnames:
                fieldnames.append(required_name)

        rows: list[dict[str, str]] = []
        existing_codes: set[int] = set()
        for raw_row in reader:
            # DictReader yields None for cells missing from short rows;
            # normalise them to "" so every value really is a string.
            row = {fieldname: raw_row.get(fieldname) or "" for fieldname in fieldnames}
            shape_code = try_parse_int(row["shape_code"].strip()) if "shape_code" in row else None
            if shape_code is not None:
                if shape_code in existing_codes:
                    # Keep only the first row for a duplicated shape code.
                    continue
                row["shape_code"] = format_shape_code(shape_code)
                existing_codes.add(shape_code)
            # Rows with an unparseable shape code are preserved untouched.
            rows.append(row)

    return fieldnames, rows, existing_codes


def write_catalog(csv_path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write ``rows`` to ``csv_path`` with a header, creating parent directories."""
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    with csv_path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def scan_exported_pseudocode_shape_codes(extracted_root: Path) -> set[int]:
    """Collect shape codes from every ``*.txt`` under ``<extracted_root>/pseudocode``.

    Returns an empty set (after printing a notice) when the directory does
    not exist.  Progress is printed for each file.
    """
    pseudocode_root = extracted_root / "pseudocode"
    if not pseudocode_root.exists():
        print(f"No exported pseudocode directory at {pseudocode_root}; skipping pre-scan", flush=True)
        return set()

    shape_codes: set[int] = set()
    pseudocode_paths = sorted(pseudocode_root.rglob("*.txt"))
    print(f"Scanning {len(pseudocode_paths)} exported pseudocode files under {pseudocode_root}", flush=True)
    for index, pseudocode_path in enumerate(pseudocode_paths, start=1):
        print(
            f"[pseudocode {index}/{len(pseudocode_paths)}] {pseudocode_path.relative_to(extracted_root).as_posix()}",
            flush=True,
        )
        shape_codes.update(collect_shape_codes_from_pseudocode(pseudocode_path.read_text(encoding="utf-8")))
    return shape_codes


def auto_shape_identifier(class_name: str, shape_code: int) -> str:
    """Build an identifier of the form ``<class>_shape_<hex code>``.

    An empty ``class_name`` falls back to ``"shape"``; the code is rendered
    as at least four lowercase hex digits.
    """
    class_id = sanitize_identifier((class_name or "shape").lower())
    return sanitize_identifier(f"{class_id}_shape_{shape_code:04x}")


def collect_shape_suggestions_from_pseudocode(text: str, row: dict[str, str]) -> dict[int, dict[str, str]]:
    """Derive catalog suggestions from self-shape comparisons in ``text``.

    Each literal that the pseudocode compares against the object's own
    shape/type yields a suggested ``human_readable_id`` and ``description``
    based on the row's ``class_name_hint``.  Rows without a class name
    produce no suggestions.
    """
    class_name = (row.get("class_name_hint") or "").strip()
    if not class_name:
        return {}

    suggestions: dict[int, dict[str, str]] = {}
    for match in SELF_SHAPE_COMPARISON_PATTERN.finditer(text):
        shape_code = try_parse_int(match.group("value"))
        if shape_code is None:
            continue
        suggestions[shape_code] = {
            "human_readable_id": auto_shape_identifier(class_name, shape_code),
            "description": f"Auto-derived from {class_name} self-shape comparison in USECODE",
        }
    return suggestions


def scan_shape_codes(extracted_root: Path, variant: str) -> tuple[set[int], dict[int, dict[str, str]]]:
    """Scan exported pseudocode and decoded USECODE bodies for shape codes.

    Args:
        extracted_root: Directory containing ``class_event_index.tsv`` and
            ``class_layout_index.tsv``.
        variant: Intrinsic numbering variant; ``"auto"`` is forwarded to the
            parser as ``None``.

    Returns:
        ``(shape_codes, suggestions)`` where ``suggestions`` only contains
        shape codes for which exactly one distinct identifier was proposed.
    """
    class_event_index = extracted_root / "class_event_index.tsv"
    class_layout_index = extracted_root / "class_layout_index.tsv"

    rows = load_tsv_rows(class_event_index)
    # Only rows with a derived body range can be decoded.
    work_rows = [row for row in rows if row.get("derived_body_start") and row.get("derived_body_end")]
    layout_by_entry = load_layout_by_entry(class_layout_index)

    shape_codes = scan_exported_pseudocode_shape_codes(extracted_root)
    # shape code -> {suggested identifier -> suggestion}
    suggestion_candidates: dict[int, dict[str, dict[str, str]]] = {}

    print(f"Scanning {len(work_rows)} decoded USECODE bodies from {extracted_root}", flush=True)
    for position, row in enumerate(work_rows, start=1):
        entry_index = parse_int(row["entry_index"])
        layout_row = layout_by_entry.get(entry_index)
        if layout_row is None:
            print(
                f"[{position}/{len(work_rows)}] Skipping {describe_row(row)} because no layout row was found",
                flush=True,
            )
            continue

        label = describe_row(row)
        print(f"[{position}/{len(work_rows)}] Decoding {label}", flush=True)
        ir = parse_body_ir(row, layout_row, None if variant == "auto" else variant, extracted_root)
        print(f"[{position}/{len(work_rows)}] Rendering {label}", flush=True)
        pseudocode = render_pseudocode(ir)

        shape_codes.update(collect_shape_codes_from_pseudocode(pseudocode))
        for shape_code, suggestion in collect_shape_suggestions_from_pseudocode(pseudocode, row).items():
            suggestion_candidates.setdefault(shape_code, {})[suggestion["human_readable_id"]] = suggestion

    # Drop ambiguous codes: keep a suggestion only when every occurrence
    # agreed on a single identifier.
    resolved_suggestions: dict[int, dict[str, str]] = {}
    for shape_code, candidates in suggestion_candidates.items():
        if len(candidates) == 1:
            resolved_suggestions[shape_code] = next(iter(candidates.values()))
    return shape_codes, resolved_suggestions


def main() -> None:
    """Parse command-line arguments and update the shape catalog CSV."""
    parser = argparse.ArgumentParser(description="Generate or update the append-only USECODE shape catalog CSV")
    parser.add_argument(
        "--extracted-root",
        default=str(EXTRACTED_ROOT),
        help="Extracted USECODE root containing class_event_index.tsv and chunks/",
    )
    # NOTE(review): the default paths in this help text start with a bare "/";
    # this looks like a lost placeholder for the extracted root - confirm.
    parser.add_argument(
        "--output-csv",
        help=(
            "Catalog CSV path "
            "(default: Remorse uses /usecode_shape_catalog_remorse.csv; "
            "Regret uses /usecode_shape_catalog_regret.csv)"
        ),
    )
    parser.add_argument(
        "--variant",
        choices=["auto", "regret", "remorse"],
        default="auto",
        help="Crusader intrinsic numbering to apply during scanning (default: auto, fallback regret)",
    )
    args = parser.parse_args()

    extracted_root = Path(args.extracted_root)
    output_csv = Path(args.output_csv) if args.output_csv else default_shape_catalog_path(extracted_root, args.variant)
    print(
        f"Updating shape catalog {output_csv} from extracted_root={extracted_root} using variant={args.variant}",
        flush=True,
    )

    shape_codes, suggested_rows = scan_shape_codes(extracted_root, args.variant)
    fieldnames, rows, existing_codes = load_existing_catalog(output_csv)

    # Backfill empty name/description cells of existing rows; non-empty
    # cells are never overwritten.
    backfilled = 0
    for row in rows:
        shape_code = try_parse_int((row.get("shape_code") or "").strip())
        if shape_code is None:
            continue
        suggestion = suggested_rows.get(shape_code)
        if suggestion is None:
            continue
        row_changed = False
        if not (row.get("human_readable_id") or "").strip():
            row["human_readable_id"] = suggestion["human_readable_id"]
            row_changed = True
        if not (row.get("description") or "").strip():
            row["description"] = suggestion["description"]
            row_changed = True
        # Count a row once if either field was filled in.
        if row_changed:
            backfilled += 1

    # Append rows for codes seen in the scan but absent from the catalog.
    missing_codes = sorted(shape_codes - existing_codes)
    for shape_code in missing_codes:
        row = {fieldname: "" for fieldname in fieldnames}
        row["shape_code"] = format_shape_code(shape_code)
        suggestion = suggested_rows.get(shape_code)
        if suggestion is not None:
            row["human_readable_id"] = suggestion["human_readable_id"]
            row["description"] = suggestion["description"]
        rows.append(row)

    write_catalog(output_csv, fieldnames, rows)
    print(
        f"Catalog {output_csv} now contains {len(rows)} rows; "
        f"scanned {len(shape_codes)} distinct shape codes, added {len(missing_codes)} new rows, "
        f"and backfilled {backfilled} existing rows."
    )


if __name__ == "__main__":
    main()