# Source listing metadata: 228 lines, 9.1 KiB, Python.
from __future__ import annotations
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import csv
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
|
||
|
|
# Repository root: the parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Put the repository root at the front of sys.path, once. Presumably this is what
# lets the `tools.` package import further down resolve when the file is run
# directly as a script -- confirm against how the tool is launched.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
|
||
|
|
|
||
|
|
|
||
|
|
from tools.poc_crusader_usecode_parser import (
|
||
|
|
EXTRACTED_ROOT,
|
||
|
|
NUMERIC_SHAPE_LITERAL_PATTERN,
|
||
|
|
SHAPE_CATALOG_FIELDNAMES,
|
||
|
|
collect_shape_codes_from_pseudocode,
|
||
|
|
default_shape_catalog_path,
|
||
|
|
format_shape_code,
|
||
|
|
load_tsv_rows,
|
||
|
|
parse_body_ir,
|
||
|
|
parse_int,
|
||
|
|
render_pseudocode,
|
||
|
|
sanitize_identifier,
|
||
|
|
try_parse_int,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
# Matches `Item.getShape(arg_06)` or `Item.getType(arg_06)` compared (==, !=, <=,
# >=, <, >) against a numeric literal as defined by NUMERIC_SHAPE_LITERAL_PATTERN.
# The literal is captured in the named group "value".
# NOTE(review): arg_06 is presumably the item's own reference, judging by the
# constant's name -- confirm against the pseudocode renderer.
SELF_SHAPE_COMPARISON_PATTERN = re.compile(
    rf"\bItem\.(?:getShape|getType)\(\s*arg_06\s*\)\s*(?:==|!=|<=|>=|<|>)\s*(?P<value>{NUMERIC_SHAPE_LITERAL_PATTERN})\b"
)
|
||
|
|
|
||
|
|
|
||
|
|
def describe_row(row: dict[str, str]) -> str:
    """Build a short human-readable label for a class-event row (used in progress output).

    Args:
        row: Row mapping; the keys read are ``class_name_hint``, ``slot``,
            ``event_name_hint`` and ``entry_index``.

    Returns:
        A label of the form ``entry <index> <class>::<event> (slot 0x<NN>)``.
        A missing class name becomes ``unknown`` and a missing event name becomes
        ``slot_<NN>``.
    """
    class_name = row.get("class_name_hint") or "unknown"
    # A missing *or blank* slot falls back to "0", matching the `or` fallbacks used
    # for the other columns, so a sparse row cannot break a log label.
    slot = parse_int(row.get("slot") or "0")
    event_name = row.get("event_name_hint") or f"slot_{slot:02X}"
    entry_index = row.get("entry_index", "?")
    # Formatted in a single pass (no placeholder/replace step) so that names which
    # themselves contain "::$" are reproduced verbatim.
    return f"entry {entry_index} {class_name}::{event_name} (slot 0x{slot:02X})"
|
||
|
|
|
||
|
|
|
||
|
|
def load_layout_by_entry(class_layout_index: Path) -> dict[int, dict[str, str]]:
    """Index the rows of the class layout TSV by their integer ``entry_index``.

    Rows whose ``entry_index`` cannot be parsed are left out. When several rows
    share an index, the last one read is the one kept.

    Args:
        class_layout_index: Path of the layout TSV handed to ``load_tsv_rows``.

    Returns:
        Mapping from parsed entry index to the corresponding row.
    """
    layout_by_entry: dict[int, dict[str, str]] = {}
    for layout_row in load_tsv_rows(class_layout_index):
        parsed_index = try_parse_int(layout_row.get("entry_index", ""))
        if parsed_index is not None:
            layout_by_entry[parsed_index] = layout_row
    return layout_by_entry
|
||
|
|
|
||
|
|
|
||
|
|
def load_existing_catalog(csv_path: Path) -> tuple[list[str], list[dict[str, str]], set[int]]:
    """Load the existing shape catalog CSV, normalising its columns and shape codes.

    Args:
        csv_path: Location of the catalog CSV; it does not have to exist.

    Returns:
        A ``(fieldnames, rows, existing_codes)`` tuple:

        * ``fieldnames`` -- the header columns found in the file (blank names
          dropped), with any missing ``SHAPE_CATALOG_FIELDNAMES`` appended.
        * ``rows`` -- one dict per kept row, containing exactly ``fieldnames``.
          Parseable shape codes are rewritten through ``format_shape_code``; rows
          repeating an already-seen code are dropped; rows whose code cannot be
          parsed are kept unchanged.
        * ``existing_codes`` -- the set of parsed shape codes present in ``rows``.

        When the file does not exist, the default field names, no rows and an
        empty set are returned.
    """
    if not csv_path.exists():
        return SHAPE_CATALOG_FIELDNAMES[:], [], set()

    # "utf-8-sig" also reads plain UTF-8, and additionally strips a leading BOM
    # (as added by some spreadsheet exports). Without that, the first header would
    # read as "\ufeffshape_code" and no existing code would be recognised.
    with csv_path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        fieldnames = [name for name in (reader.fieldnames or []) if name] or SHAPE_CATALOG_FIELDNAMES[:]
        for required_name in SHAPE_CATALOG_FIELDNAMES:
            if required_name not in fieldnames:
                fieldnames.append(required_name)

        rows: list[dict[str, str]] = []
        existing_codes: set[int] = set()
        for raw_row in reader:
            # DictReader fills the missing cells of a short row with None; store ""
            # so every value really is a string, as the return type promises.
            row = {fieldname: raw_row.get(fieldname) or "" for fieldname in fieldnames}
            shape_code = try_parse_int((row.get("shape_code") or "").strip())
            if shape_code is not None:
                if shape_code in existing_codes:
                    # Keep only the first row for any given shape code.
                    continue
                row["shape_code"] = format_shape_code(shape_code)
                existing_codes.add(shape_code)
            rows.append(row)

    return fieldnames, rows, existing_codes
|
||
|
|
|
||
|
|
|
||
|
|
def write_catalog(csv_path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write the catalog rows to ``csv_path`` as UTF-8 CSV with a header line.

    The data is first written to a sibling ``<name>.tmp`` file, which is then
    renamed over ``csv_path``. If writing fails part-way, any previously existing
    catalog is left untouched instead of being truncated.

    Args:
        csv_path: Destination file; missing parent directories are created.
        fieldnames: Column order for the header and every row.
        rows: Row dicts; each may only use keys listed in ``fieldnames``.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` when a row holds a key that is not
            in ``fieldnames``.
        OSError: When the file cannot be created, written or renamed.
    """
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    temp_path = csv_path.with_name(f"{csv_path.name}.tmp")
    try:
        with temp_path.open("w", encoding="utf-8", newline="") as handle:
            writer = csv.DictWriter(handle, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        # Only swap the new file in once it has been written completely.
        temp_path.replace(csv_path)
    finally:
        # After a successful rename the temp file is gone; after a failure this
        # removes the partial output.
        if temp_path.exists():
            temp_path.unlink()
|
||
|
|
|
||
|
|
|
||
|
|
def scan_exported_pseudocode_shape_codes(extracted_root: Path) -> set[int]:
    """Collect shape codes from pseudocode text files that were already exported.

    Every ``*.txt`` file below ``<extracted_root>/pseudocode`` (searched
    recursively, processed in sorted order) is read as UTF-8 and passed to
    ``collect_shape_codes_from_pseudocode``. Progress is printed per file.

    Args:
        extracted_root: Extracted USECODE root directory.

    Returns:
        The union of the shape codes found; an empty set when there is no
        ``pseudocode`` directory.
    """
    pseudocode_root = extracted_root / "pseudocode"
    # is_dir() rather than exists(): a stray regular file called "pseudocode" is
    # reported as "no directory" too, instead of being globbed.
    if not pseudocode_root.is_dir():
        print(f"No exported pseudocode directory at {pseudocode_root}; skipping pre-scan", flush=True)
        return set()

    shape_codes: set[int] = set()
    pseudocode_paths = sorted(pseudocode_root.rglob("*.txt"))
    total = len(pseudocode_paths)
    print(f"Scanning {total} exported pseudocode files under {pseudocode_root}", flush=True)
    for index, pseudocode_path in enumerate(pseudocode_paths, start=1):
        print(f"[pseudocode {index}/{total}] {pseudocode_path.relative_to(extracted_root).as_posix()}", flush=True)
        shape_codes.update(collect_shape_codes_from_pseudocode(pseudocode_path.read_text(encoding="utf-8")))
    return shape_codes
|
||
|
|
|
||
|
|
|
||
|
|
def auto_shape_identifier(class_name: str, shape_code: int) -> str:
    """Derive a catalog identifier from a class name and a shape code.

    The class name (or ``"shape"`` when it is empty) is lower-cased and passed
    through ``sanitize_identifier``; ``_shape_`` and the code as at least four
    lower-case hex digits are appended, and the result is sanitised again.
    """
    base_name = class_name if class_name else "shape"
    class_id = sanitize_identifier(base_name.lower())
    candidate = f"{class_id}_shape_{shape_code:04x}"
    return sanitize_identifier(candidate)
|
||
|
|
|
||
|
|
|
||
|
|
def collect_shape_suggestions_from_pseudocode(text: str, row: dict[str, str]) -> dict[int, dict[str, str]]:
    """Suggest catalog names for shape codes that the pseudocode compares against.

    Args:
        text: Rendered pseudocode to search with ``SELF_SHAPE_COMPARISON_PATTERN``.
        row: Class-event row; only ``class_name_hint`` is read.

    Returns:
        Mapping from shape code to a dict holding ``human_readable_id`` and
        ``description``. Empty when the row has no (non-blank) class name hint.
        Matches whose literal cannot be parsed are ignored.
    """
    class_name = (row.get("class_name_hint") or "").strip()
    if class_name == "":
        return {}

    description = f"Auto-derived from {class_name} self-shape comparison in USECODE"
    suggestions: dict[int, dict[str, str]] = {}
    for comparison in SELF_SHAPE_COMPARISON_PATTERN.finditer(text):
        shape_code = try_parse_int(comparison.group("value"))
        if shape_code is not None:
            suggestions[shape_code] = {
                "human_readable_id": auto_shape_identifier(class_name, shape_code),
                "description": description,
            }
    return suggestions
|
||
|
|
|
||
|
|
|
||
|
|
def scan_shape_codes(extracted_root: Path, variant: str) -> tuple[set[int], dict[int, dict[str, str]]]:
    """Decode every USECODE body under ``extracted_root`` and gather its shape codes.

    Shape codes come from two places: the already exported pseudocode files (see
    ``scan_exported_pseudocode_shape_codes``) and the pseudocode rendered here for
    each row of ``class_event_index.tsv`` that has both ``derived_body_start`` and
    ``derived_body_end`` set. Progress is printed for every row.

    Args:
        extracted_root: Directory holding ``class_event_index.tsv`` and
            ``class_layout_index.tsv``.
        variant: Variant name forwarded to ``parse_body_ir``; ``"auto"`` is
            forwarded as ``None``.

    Returns:
        ``(shape_codes, suggestions)``. ``suggestions`` maps a shape code to its
        naming suggestion, but only for codes where all scanned bodies agree on a
        single ``human_readable_id``; codes with conflicting candidates are omitted.
    """
    class_event_index = extracted_root / "class_event_index.tsv"
    class_layout_index = extracted_root / "class_layout_index.tsv"
    rows = load_tsv_rows(class_event_index)
    work_rows = [row for row in rows if row.get("derived_body_start") and row.get("derived_body_end")]
    layout_by_entry = load_layout_by_entry(class_layout_index)
    shape_codes = scan_exported_pseudocode_shape_codes(extracted_root)
    suggestion_candidates: dict[int, dict[str, dict[str, str]]] = {}

    # Loop invariants, computed once.
    total = len(work_rows)
    forced_variant = None if variant == "auto" else variant
    print(f"Scanning {total} decoded USECODE bodies from {extracted_root}", flush=True)

    for position, row in enumerate(work_rows, start=1):
        # Parsed leniently, like the layout index: a missing or malformed
        # entry_index takes the skip path below instead of aborting the scan.
        entry_index = try_parse_int(row.get("entry_index") or "")
        layout_row = None if entry_index is None else layout_by_entry.get(entry_index)
        if layout_row is None:
            print(
                f"[{position}/{total}] Skipping {describe_row(row)} because no layout row was found",
                flush=True,
            )
            continue

        label = describe_row(row)
        print(f"[{position}/{total}] Decoding {label}", flush=True)
        ir = parse_body_ir(row, layout_row, forced_variant, extracted_root)
        print(f"[{position}/{total}] Rendering {label}", flush=True)
        pseudocode = render_pseudocode(ir)
        shape_codes.update(collect_shape_codes_from_pseudocode(pseudocode))
        for shape_code, suggestion in collect_shape_suggestions_from_pseudocode(pseudocode, row).items():
            # Candidates are keyed by identifier so conflicts can be detected below.
            suggestion_candidates.setdefault(shape_code, {})[suggestion["human_readable_id"]] = suggestion

    # Keep a suggestion only when it is unambiguous for its shape code.
    resolved_suggestions: dict[int, dict[str, str]] = {
        shape_code: next(iter(candidates.values()))
        for shape_code, candidates in suggestion_candidates.items()
        if len(candidates) == 1
    }
    return shape_codes, resolved_suggestions
|
||
|
|
|
||
|
|
|
||
|
|
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: scan for shape codes and update the catalog CSV.

    Existing catalog rows are kept. Blank ``human_readable_id`` / ``description``
    cells are filled from unambiguous scan suggestions, and one new row is
    appended (in ascending code order) for every scanned shape code that the
    catalog does not contain yet. The catalog is then rewritten and a summary is
    printed.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse read
            the process command line, so existing ``main()`` callers are unaffected.
    """
    parser = argparse.ArgumentParser(description="Generate or update the append-only USECODE shape catalog CSV")
    parser.add_argument(
        "--extracted-root",
        default=str(EXTRACTED_ROOT),
        help="Extracted USECODE root containing class_event_index.tsv and chunks/",
    )
    parser.add_argument(
        "--output-csv",
        help=(
            "Catalog CSV path "
            "(default: Remorse uses <extracted-root>/usecode_shape_catalog_remorse.csv; "
            "Regret uses <extracted-root>/usecode_shape_catalog_regret.csv)"
        ),
    )
    parser.add_argument(
        "--variant",
        choices=["auto", "regret", "remorse"],
        default="auto",
        help="Crusader intrinsic numbering to apply during scanning (default: auto, fallback regret)",
    )
    args = parser.parse_args(argv)

    extracted_root = Path(args.extracted_root)
    output_csv = Path(args.output_csv) if args.output_csv else default_shape_catalog_path(extracted_root, args.variant)

    print(
        f"Updating shape catalog {output_csv} from extracted_root={extracted_root} using variant={args.variant}",
        flush=True,
    )
    shape_codes, suggested_rows = scan_shape_codes(extracted_root, args.variant)
    fieldnames, rows, existing_codes = load_existing_catalog(output_csv)

    # Fill blank cells of existing rows; values that are already set are never
    # overwritten.
    backfilled = 0
    for row in rows:
        shape_code = try_parse_int((row.get("shape_code") or "").strip())
        if shape_code is None:
            continue
        suggestion = suggested_rows.get(shape_code)
        if suggestion is None:
            continue
        row_changed = False
        for column in ("human_readable_id", "description"):
            if not (row.get(column) or "").strip():
                row[column] = suggestion[column]
                row_changed = True
        # Count the row once if either cell was filled, so the summary also
        # covers rows where only the description was missing.
        if row_changed:
            backfilled += 1

    # Append one row per newly discovered shape code.
    missing_codes = sorted(shape_codes - existing_codes)
    for shape_code in missing_codes:
        row = {fieldname: "" for fieldname in fieldnames}
        row["shape_code"] = format_shape_code(shape_code)
        suggestion = suggested_rows.get(shape_code)
        if suggestion is not None:
            row["human_readable_id"] = suggestion["human_readable_id"]
            row["description"] = suggestion["description"]
        rows.append(row)

    write_catalog(output_csv, fieldnames, rows)
    print(
        f"Catalog {output_csv} now contains {len(rows)} rows; "
        f"scanned {len(shape_codes)} distinct shape codes, added {len(missing_codes)} new rows, "
        f"and backfilled {backfilled} existing rows.",
        flush=True,
    )
|
||
|
|
|
||
|
|
|
||
|
|
# Run the catalog update only when this file is executed as a script.
if __name__ == "__main__":
    main()
|