Crusader_Decomp/tools/update_usecode_shape_catalog.py

from __future__ import annotations

import argparse
import csv
import re
import sys
from pathlib import Path


# Repository root: the parent of the directory that contains this file
# (index 1 of the resolved path's ancestors).
REPO_ROOT = Path(__file__).resolve().parents[1]
# Prepend the root to sys.path (only if absent) so the absolute ``tools.``
# import below can be resolved when this file is executed directly.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))


from tools.poc_crusader_usecode_parser import (
    EXTRACTED_ROOT,
    NUMERIC_SHAPE_LITERAL_PATTERN,
    SHAPE_CATALOG_FIELDNAMES,
    collect_shape_codes_from_pseudocode,
    default_shape_catalog_path,
    format_shape_code,
    load_tsv_rows,
    parse_body_ir,
    parse_int,
    render_pseudocode,
    sanitize_identifier,
    try_parse_int,
)


# Matches ``Item.getShape(arg_06)`` or ``Item.getType(arg_06)`` compared
# (==, !=, <=, >=, <, >) against a numeric literal; the literal is captured in
# the named group ``value``.  The literal syntax itself is supplied by
# NUMERIC_SHAPE_LITERAL_PATTERN from the parser module.
# NOTE(review): ``arg_06`` is presumably the item the USECODE body runs on
# ("self"), going by this constant's name -- confirm against the parser.
SELF_SHAPE_COMPARISON_PATTERN = re.compile(
    rf"\bItem\.(?:getShape|getType)\(\s*arg_06\s*\)\s*(?:==|!=|<=|>=|<|>)\s*(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"
)


def describe_row(row: dict[str, str]) -> str:
    """Build a one-line label for a class/event index row.

    The label reads ``entry <index> <class>::<event> (slot 0x<NN>)``.  An
    empty or missing class hint becomes ``unknown``, an empty or missing event
    hint becomes ``slot_<NN>``, and a missing entry index is shown as ``?``.
    """
    owner = row.get("class_name_hint") or "unknown"
    slot_hex = f"{parse_int(row.get('slot', '0')):02X}"
    event = row.get("event_name_hint") or f"slot_{slot_hex}"
    index_text = row.get("entry_index", "?")
    # A literal "::$" separator is written and then collapsed to "::" over the
    # whole prefix, so any "::$" already present in the fields collapses too.
    qualified = f"entry {index_text} {owner}::${event}".replace("::$", "::")
    return f"{qualified} (slot 0x{slot_hex})"


def load_layout_by_entry(class_layout_index: Path) -> dict[int, dict[str, str]]:
    """Index the rows of the class-layout TSV by their integer ``entry_index``.

    Rows for which ``try_parse_int`` returns ``None`` for the ``entry_index``
    column are left out.  When several rows share an index, the last one read
    is the one kept.
    """
    keyed_rows = (
        (try_parse_int(layout_row.get("entry_index", "")), layout_row)
        for layout_row in load_tsv_rows(class_layout_index)
    )
    return {key: layout_row for key, layout_row in keyed_rows if key is not None}


def load_existing_catalog(csv_path: Path) -> tuple[list[str], list[dict[str, str]], set[int]]:
    """Load the current catalog CSV, if there is one.

    Args:
        csv_path: Location of the catalog CSV.

    Returns:
        A ``(fieldnames, rows, existing_codes)`` tuple:

        * ``fieldnames`` -- the non-empty header names from the file, followed
          by any name from ``SHAPE_CATALOG_FIELDNAMES`` the file lacks.  When
          the file does not exist or has no usable header, a copy of
          ``SHAPE_CATALOG_FIELDNAMES`` is returned.
        * ``rows`` -- one dict per kept record, restricted to ``fieldnames``,
          with every value a string.  Records whose ``shape_code`` parses get
          that value rewritten through ``format_shape_code``; a record whose
          parsed code was already seen is dropped.  Records whose
          ``shape_code`` does not parse are kept unchanged.
        * ``existing_codes`` -- the set of parsed shape codes in ``rows``.
    """
    if not csv_path.exists():
        return SHAPE_CATALOG_FIELDNAMES[:], [], set()

    with csv_path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        fieldnames = [name for name in (reader.fieldnames or []) if name] or SHAPE_CATALOG_FIELDNAMES[:]
        for required_name in SHAPE_CATALOG_FIELDNAMES:
            if required_name not in fieldnames:
                fieldnames.append(required_name)

        rows: list[dict[str, str]] = []
        existing_codes: set[int] = set()
        for raw_row in reader:
            # DictReader pads records shorter than the header with None
            # (its ``restval``), and a ``.get`` default does not apply to a key
            # that is present.  Normalise to "" so every value really is a
            # string, matching columns that are absent from the header.
            row = {fieldname: raw_row.get(fieldname) or "" for fieldname in fieldnames}
            shape_code = try_parse_int((row.get("shape_code") or "").strip())
            if shape_code is not None:
                if shape_code in existing_codes:
                    # Keep only the first record for a given shape code.
                    continue
                row["shape_code"] = format_shape_code(shape_code)
                existing_codes.add(shape_code)
            rows.append(row)
    return fieldnames, rows, existing_codes


def write_catalog(csv_path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write the catalog to ``csv_path`` as UTF-8 CSV, header line first.

    Missing parent directories are created, and an existing file at
    ``csv_path`` is overwritten.
    """
    target_dir = csv_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(csv_path, mode="w", encoding="utf-8", newline="") as out_file:
        catalog_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        catalog_writer.writeheader()
        catalog_writer.writerows(rows)


def scan_exported_pseudocode_shape_codes(extracted_root: Path) -> set[int]:
    """Collect shape codes from pseudocode text files already on disk.

    Every ``*.txt`` file below ``<extracted_root>/pseudocode`` is read as
    UTF-8, in sorted path order, and passed to
    ``collect_shape_codes_from_pseudocode``.  A progress line is printed per
    file.  If the directory does not exist, a notice is printed and an empty
    set is returned.
    """
    pseudocode_root = extracted_root / "pseudocode"
    if not pseudocode_root.exists():
        print(f"No exported pseudocode directory at {pseudocode_root}; skipping pre-scan", flush=True)
        return set()

    exported_files = sorted(pseudocode_root.rglob("*.txt"))
    total = len(exported_files)
    print(f"Scanning {total} exported pseudocode files under {pseudocode_root}", flush=True)

    found: set[int] = set()
    for ordinal, exported_file in enumerate(exported_files, start=1):
        # Show the path relative to the extracted root, with forward slashes.
        shown_path = exported_file.relative_to(extracted_root).as_posix()
        print(f"[pseudocode {ordinal}/{total}] {shown_path}", flush=True)
        file_text = exported_file.read_text(encoding="utf-8")
        found.update(collect_shape_codes_from_pseudocode(file_text))
    return found


def auto_shape_identifier(class_name: str, shape_code: int) -> str:
    """Derive a catalog identifier from a class name and a shape code.

    The lower-cased class name (or ``shape`` when the name is empty) is passed
    through ``sanitize_identifier``, suffixed with ``_shape_`` and the code as
    lowercase hex zero-padded to at least four digits, then sanitised again.
    """
    base_name = class_name if class_name else "shape"
    prefix = sanitize_identifier(base_name.lower())
    raw_identifier = f"{prefix}_shape_{shape_code:04x}"
    return sanitize_identifier(raw_identifier)


def collect_shape_suggestions_from_pseudocode(text: str, row: dict[str, str]) -> dict[int, dict[str, str]]:
    """Propose catalog names for shape codes compared in ``text``.

    Each match of ``SELF_SHAPE_COMPARISON_PATTERN`` whose literal parses via
    ``try_parse_int`` produces an entry keyed by the shape code, holding a
    ``human_readable_id`` and a ``description`` built from the row's
    ``class_name_hint``.  Nothing is proposed (empty dict) when that hint is
    missing or blank.
    """
    class_name = (row.get("class_name_hint") or "").strip()
    if class_name == "":
        return {}

    # Identical for every match in this body, so build it once.
    description = f"Auto-derived from {class_name} self-shape comparison in USECODE"
    proposals: dict[int, dict[str, str]] = {}
    literals = (found.group("value") for found in SELF_SHAPE_COMPARISON_PATTERN.finditer(text))
    for literal in literals:
        code = try_parse_int(literal)
        if code is not None:
            proposals[code] = {
                "human_readable_id": auto_shape_identifier(class_name, code),
                "description": description,
            }
    return proposals


def scan_shape_codes(extracted_root: Path, variant: str) -> tuple[set[int], dict[int, dict[str, str]]]:
    """Scan USECODE bodies under ``extracted_root`` for shape codes.

    Rows of ``class_event_index.tsv`` that have both ``derived_body_start``
    and ``derived_body_end`` set are decoded with ``parse_body_ir`` and
    rendered with ``render_pseudocode``; rows with no matching entry in
    ``class_layout_index.tsv`` are skipped.  Shape codes from the rendered
    text are added to those found in previously exported pseudocode files.
    Progress is printed for every row.

    Args:
        extracted_root: Directory holding the two TSV indexes.
        variant: ``"auto"`` passes ``None`` to the parser; any other value is
            passed through unchanged.

    Returns:
        ``(shape_codes, suggestions)``.  ``suggestions`` maps a shape code to
        its proposed name/description, and only contains codes for which all
        scanned bodies proposed one and the same ``human_readable_id``.
    """
    event_rows = load_tsv_rows(extracted_root / "class_event_index.tsv")
    work_rows = [row for row in event_rows if row.get("derived_body_start") and row.get("derived_body_end")]
    layout_by_entry = load_layout_by_entry(extracted_root / "class_layout_index.tsv")
    shape_codes = scan_exported_pseudocode_shape_codes(extracted_root)

    total = len(work_rows)
    parser_variant = None if variant == "auto" else variant
    # shape code -> {proposed identifier -> suggestion}
    candidates_by_code: dict[int, dict[str, dict[str, str]]] = {}
    print(f"Scanning {total} decoded USECODE bodies from {extracted_root}", flush=True)

    for position, row in enumerate(work_rows, start=1):
        progress = f"[{position}/{total}]"
        layout_row = layout_by_entry.get(parse_int(row["entry_index"]))
        label = describe_row(row)
        if layout_row is None:
            print(f"{progress} Skipping {label} because no layout row was found", flush=True)
            continue

        print(f"{progress} Decoding {label}", flush=True)
        ir = parse_body_ir(row, layout_row, parser_variant, extracted_root)
        print(f"{progress} Rendering {label}", flush=True)
        pseudocode = render_pseudocode(ir)
        shape_codes.update(collect_shape_codes_from_pseudocode(pseudocode))

        proposals = collect_shape_suggestions_from_pseudocode(pseudocode, row)
        for shape_code, suggestion in proposals.items():
            per_code = candidates_by_code.setdefault(shape_code, {})
            per_code[suggestion["human_readable_id"]] = suggestion

    # A code proposed under more than one identifier is ambiguous and gets no
    # suggestion at all.
    resolved = {
        shape_code: next(iter(candidates.values()))
        for shape_code, candidates in candidates_by_code.items()
        if len(candidates) == 1
    }
    return shape_codes, resolved


def main() -> None:
    """Command-line entry point: scan USECODE and update the shape catalog CSV.

    Steps:

    1. Parse ``--extracted-root``, ``--output-csv`` and ``--variant``.
    2. Scan the extracted root for shape codes and name suggestions.
    3. Load the existing catalog (if any) and fill blank
       ``human_readable_id`` / ``description`` cells from the suggestions;
       cells that already hold text are never overwritten.
    4. Append one row per scanned shape code the catalog does not yet hold,
       in ascending code order.
    5. Rewrite the catalog file and print a summary.
    """
    parser = argparse.ArgumentParser(description="Generate or update the append-only USECODE shape catalog CSV")
    parser.add_argument(
        "--extracted-root",
        default=str(EXTRACTED_ROOT),
        help="Extracted USECODE root containing class_event_index.tsv and chunks/",
    )
    parser.add_argument(
        "--output-csv",
        help=(
            "Catalog CSV path "
            "(default: Remorse uses <extracted-root>/usecode_shape_catalog_remorse.csv; "
            "Regret uses <extracted-root>/usecode_shape_catalog_regret.csv)"
        ),
    )
    parser.add_argument(
        "--variant",
        choices=["auto", "regret", "remorse"],
        default="auto",
        help="Crusader intrinsic numbering to apply during scanning (default: auto, fallback regret)",
    )
    args = parser.parse_args()

    extracted_root = Path(args.extracted_root)
    output_csv = Path(args.output_csv) if args.output_csv else default_shape_catalog_path(extracted_root, args.variant)

    print(
        f"Updating shape catalog {output_csv} from extracted_root={extracted_root} using variant={args.variant}",
        flush=True,
    )
    shape_codes, suggested_rows = scan_shape_codes(extracted_root, args.variant)
    fieldnames, rows, existing_codes = load_existing_catalog(output_csv)

    backfilled = 0
    for row in rows:
        shape_code = try_parse_int((row.get("shape_code") or "").strip())
        if shape_code is None:
            continue
        suggestion = suggested_rows.get(shape_code)
        if suggestion is None:
            continue
        # Fill only blank cells; text already in the catalog always wins.
        filled_any = False
        for field in ("human_readable_id", "description"):
            if not (row.get(field) or "").strip():
                row[field] = suggestion[field]
                filled_any = True
        # Count each touched row once, whichever cell was filled, so the
        # summary also covers rows where only the description was blank.
        if filled_any:
            backfilled += 1

    missing_codes = sorted(shape_codes - existing_codes)
    for shape_code in missing_codes:
        row = {fieldname: "" for fieldname in fieldnames}
        row["shape_code"] = format_shape_code(shape_code)
        suggestion = suggested_rows.get(shape_code)
        if suggestion is not None:
            row["human_readable_id"] = suggestion["human_readable_id"]
            row["description"] = suggestion["description"]
        rows.append(row)

    write_catalog(output_csv, fieldnames, rows)
    print(
        f"Catalog {output_csv} now contains {len(rows)} rows; "
        f"scanned {len(shape_codes)} distinct shape codes, added {len(missing_codes)} new rows, "
        f"and backfilled {backfilled} existing rows."
    )


# Run the catalog update only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Pseudocode decompilation improvements and docs 2026-03-26 22:10:48 +01:00			`from __future__ import annotations`

			`import argparse`
			`import csv`
			`import re`
			`import sys`
			`from pathlib import Path`


			`REPO_ROOT = Path(__file__).resolve().parents[1]`
			`if str(REPO_ROOT) not in sys.path:`
			`sys.path.insert(0, str(REPO_ROOT))`


			`from tools.poc_crusader_usecode_parser import (`
			`EXTRACTED_ROOT,`
			`NUMERIC_SHAPE_LITERAL_PATTERN,`
			`SHAPE_CATALOG_FIELDNAMES,`
			`collect_shape_codes_from_pseudocode,`
			`default_shape_catalog_path,`
			`format_shape_code,`
			`load_tsv_rows,`
			`parse_body_ir,`
			`parse_int,`
			`render_pseudocode,`
			`sanitize_identifier,`
			`try_parse_int,`
			`)`


			`SELF_SHAPE_COMPARISON_PATTERN = re.compile(`
			`rf"\bItem\.(?:getShape\|getType)\(\sarg_06\s\)\s(?:==\|!=\|<=\|>=\|<\|>)\s(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"`
			`)`


			`def describe_row(row: dict[str, str]) -> str:`
			`class_name = row.get("class_name_hint") or "unknown"`
			`slot = parse_int(row.get("slot", "0"))`
			`event_name = row.get("event_name_hint") or f"slot_{slot:02X}"`
			`entry_index = row.get("entry_index", "?")`
			`return f"entry {entry_index} {class_name}::${event_name}".replace("::$", "::") + f" (slot 0x{slot:02X})"`


			`def load_layout_by_entry(class_layout_index: Path) -> dict[int, dict[str, str]]:`
			`rows = load_tsv_rows(class_layout_index)`
			`layout_by_entry: dict[int, dict[str, str]] = {}`
			`for row in rows:`
			`entry_index = try_parse_int(row.get("entry_index", ""))`
			`if entry_index is None:`
			`continue`
			`layout_by_entry[entry_index] = row`
			`return layout_by_entry`


			`def load_existing_catalog(csv_path: Path) -> tuple[list[str], list[dict[str, str]], set[int]]:`
			`if not csv_path.exists():`
			`return SHAPE_CATALOG_FIELDNAMES[:], [], set()`

			`with csv_path.open("r", encoding="utf-8", newline="") as handle:`
			`reader = csv.DictReader(handle)`
			`fieldnames = [name for name in (reader.fieldnames or []) if name] or SHAPE_CATALOG_FIELDNAMES[:]`
			`for required_name in SHAPE_CATALOG_FIELDNAMES:`
			`if required_name not in fieldnames:`
			`fieldnames.append(required_name)`

			`rows: list[dict[str, str]] = []`
			`existing_codes: set[int] = set()`
			`for raw_row in reader:`
			`row = {fieldname: raw_row.get(fieldname, "") for fieldname in fieldnames}`
			`shape_code = try_parse_int((row.get("shape_code") or "").strip())`
			`if shape_code is not None:`
			`if shape_code in existing_codes:`
			`continue`
			`row["shape_code"] = format_shape_code(shape_code)`
			`existing_codes.add(shape_code)`
			`rows.append(row)`
			`return fieldnames, rows, existing_codes`


			`def write_catalog(csv_path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:`
			`csv_path.parent.mkdir(parents=True, exist_ok=True)`
			`with csv_path.open("w", encoding="utf-8", newline="") as handle:`
			`writer = csv.DictWriter(handle, fieldnames=fieldnames)`
			`writer.writeheader()`
			`writer.writerows(rows)`


			`def scan_exported_pseudocode_shape_codes(extracted_root: Path) -> set[int]:`
			`pseudocode_root = extracted_root / "pseudocode"`
			`if not pseudocode_root.exists():`
			`print(f"No exported pseudocode directory at {pseudocode_root}; skipping pre-scan", flush=True)`
			`return set()`

			`shape_codes: set[int] = set()`
			`pseudocode_paths = sorted(pseudocode_root.rglob("*.txt"))`
			`print(f"Scanning {len(pseudocode_paths)} exported pseudocode files under {pseudocode_root}", flush=True)`
			`for index, pseudocode_path in enumerate(pseudocode_paths, start=1):`
			`print(f"[pseudocode {index}/{len(pseudocode_paths)}] {pseudocode_path.relative_to(extracted_root).as_posix()}", flush=True)`
			`shape_codes.update(collect_shape_codes_from_pseudocode(pseudocode_path.read_text(encoding="utf-8")))`
			`return shape_codes`


			`def auto_shape_identifier(class_name: str, shape_code: int) -> str:`
			`class_id = sanitize_identifier((class_name or "shape").lower())`
			`return sanitize_identifier(f"{class_id}_shape_{shape_code:04x}")`


			`def collect_shape_suggestions_from_pseudocode(text: str, row: dict[str, str]) -> dict[int, dict[str, str]]:`
			`class_name = (row.get("class_name_hint") or "").strip()`
			`if not class_name:`
			`return {}`

			`suggestions: dict[int, dict[str, str]] = {}`
			`for match in SELF_SHAPE_COMPARISON_PATTERN.finditer(text):`
			`shape_code = try_parse_int(match.group("value"))`
			`if shape_code is None:`
			`continue`
			`suggestions[shape_code] = {`
			`"human_readable_id": auto_shape_identifier(class_name, shape_code),`
			`"description": f"Auto-derived from {class_name} self-shape comparison in USECODE",`
			`}`
			`return suggestions`


			`def scan_shape_codes(extracted_root: Path, variant: str) -> tuple[set[int], dict[int, dict[str, str]]]:`
			`class_event_index = extracted_root / "class_event_index.tsv"`
			`class_layout_index = extracted_root / "class_layout_index.tsv"`
			`rows = load_tsv_rows(class_event_index)`
			`work_rows = [row for row in rows if row.get("derived_body_start") and row.get("derived_body_end")]`
			`layout_by_entry = load_layout_by_entry(class_layout_index)`
			`shape_codes = scan_exported_pseudocode_shape_codes(extracted_root)`
			`suggestion_candidates: dict[int, dict[str, dict[str, str]]] = {}`
			`print(f"Scanning {len(work_rows)} decoded USECODE bodies from {extracted_root}", flush=True)`

			`for position, row in enumerate(work_rows, start=1):`
			`entry_index = parse_int(row["entry_index"])`
			`layout_row = layout_by_entry.get(entry_index)`
			`if layout_row is None:`
			`print(`
			`f"[{position}/{len(work_rows)}] Skipping {describe_row(row)} because no layout row was found",`
			`flush=True,`
			`)`
			`continue`

			`label = describe_row(row)`
			`print(f"[{position}/{len(work_rows)}] Decoding {label}", flush=True)`
			`ir = parse_body_ir(row, layout_row, None if variant == "auto" else variant, extracted_root)`
			`print(f"[{position}/{len(work_rows)}] Rendering {label}", flush=True)`
			`pseudocode = render_pseudocode(ir)`
			`shape_codes.update(collect_shape_codes_from_pseudocode(pseudocode))`
			`for shape_code, suggestion in collect_shape_suggestions_from_pseudocode(pseudocode, row).items():`
			`suggestion_candidates.setdefault(shape_code, {})[suggestion["human_readable_id"]] = suggestion`

			`resolved_suggestions: dict[int, dict[str, str]] = {}`
			`for shape_code, candidates in suggestion_candidates.items():`
			`if len(candidates) == 1:`
			`resolved_suggestions[shape_code] = next(iter(candidates.values()))`

			`return shape_codes, resolved_suggestions`


			`def main() -> None:`
			`parser = argparse.ArgumentParser(description="Generate or update the append-only USECODE shape catalog CSV")`
			`parser.add_argument(`
			`"--extracted-root",`
			`default=str(EXTRACTED_ROOT),`
			`help="Extracted USECODE root containing class_event_index.tsv and chunks/",`
			`)`
			`parser.add_argument(`
			`"--output-csv",`
			`help=(`
			`"Catalog CSV path "`
			`"(default: Remorse uses <extracted-root>/usecode_shape_catalog_remorse.csv; "`
			`"Regret uses <extracted-root>/usecode_shape_catalog_regret.csv)"`
			`),`
			`)`
			`parser.add_argument(`
			`"--variant",`
			`choices=["auto", "regret", "remorse"],`
			`default="auto",`
			`help="Crusader intrinsic numbering to apply during scanning (default: auto, fallback regret)",`
			`)`
			`args = parser.parse_args()`

			`extracted_root = Path(args.extracted_root)`
			`output_csv = Path(args.output_csv) if args.output_csv else default_shape_catalog_path(extracted_root, args.variant)`

			`print(`
			`f"Updating shape catalog {output_csv} from extracted_root={extracted_root} using variant={args.variant}",`
			`flush=True,`
			`)`
			`shape_codes, suggested_rows = scan_shape_codes(extracted_root, args.variant)`
			`fieldnames, rows, existing_codes = load_existing_catalog(output_csv)`

			`backfilled = 0`
			`for row in rows:`
			`shape_code = try_parse_int((row.get("shape_code") or "").strip())`
			`if shape_code is None:`
			`continue`
			`suggestion = suggested_rows.get(shape_code)`
			`if suggestion is None:`
			`continue`
			`if not (row.get("human_readable_id") or "").strip():`
			`row["human_readable_id"] = suggestion["human_readable_id"]`
			`backfilled += 1`
			`if not (row.get("description") or "").strip():`
			`row["description"] = suggestion["description"]`

			`missing_codes = sorted(shape_codes - existing_codes)`
			`for shape_code in missing_codes:`
			`row = {fieldname: "" for fieldname in fieldnames}`
			`row["shape_code"] = format_shape_code(shape_code)`
			`suggestion = suggested_rows.get(shape_code)`
			`if suggestion is not None:`
			`row["human_readable_id"] = suggestion["human_readable_id"]`
			`row["description"] = suggestion["description"]`
			`rows.append(row)`

			`write_catalog(output_csv, fieldnames, rows)`
			`print(`
			`f"Catalog {output_csv} now contains {len(rows)} rows; "`
			`f"scanned {len(shape_codes)} distinct shape codes, added {len(missing_codes)} new rows, "`
			`f"and backfilled {backfilled} existing rows."`
			`)`


			`if __name__ == "__main__":`
			`main()`