Add detailed class event processing and family comparison tools
- Enhance `extract_eusecode_flx.py` to derive class event rows with additional metadata, including derived body windows and repeated-template statuses.
- Introduce `usecode_family_compare.py` for comparing event families, analyzing commonalities in event bodies, and generating reports on identical groups and differences.
- Implement new data structures for managing class event rows and family artifact specifications.
- Update output formats to include derived body information and repeated-family regression checks.
- Ensure robust validation of repeated-family expectations against the actual extracted data.
This commit is contained in:
parent
de42fd1ea1
commit
4d3c8cd81b
23 changed files with 15033 additions and 14221 deletions
|
|
@ -15,6 +15,7 @@ to support the next decoding pass.
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import pathlib
|
||||
import struct
|
||||
|
|
@ -61,6 +62,21 @@ class ExtractedChunk:
|
|||
class_parse_status: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ClassEventRow:
    """Immutable record for one event-table row of a parsed class chunk.

    Carries the raw row values together with the body window derived for the
    row; the three ``derived_body_*`` fields are ``None`` when no window was
    derived.
    """

    # Owning chunk and class identity.
    entry_index: int
    object_index: int
    class_id: int
    class_name_hint: str

    # Event slot number and the optional event-name hint for that slot.
    slot: int
    event_name_hint: str | None

    # Raw values read from the event row.
    raw_event_entry_word: int
    raw_code_offset: int

    # Derived body window inside the chunk bytes (start inclusive, end exclusive).
    derived_body_start: int | None
    derived_body_end: int | None
    derived_body_length: int | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class FlxTable:
|
||||
entry_count: int
|
||||
|
|
@ -69,6 +85,25 @@ class FlxTable:
|
|||
entries: list[CandidateEntry]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FamilyArtifactSpec:
    """Immutable description of one per-family decompile artifact."""

    # File-name stem shared by the emitted ``.md`` and ``.tsv`` files.
    output_stem: str
    # Heading used for the Markdown rendering.
    title: str
    # Primary labels of the class chunks that belong to the family.
    labels: tuple[str, ...]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RepeatedFamilyRowExpectation:
    """Expected values for one event row of a verified repeated-family class."""

    # Class label and event slot that identify the row.
    class_name: str
    slot: int

    # Expected raw row values.
    raw_event_entry_word: int
    raw_code_offset: int

    # Expected derived body window.
    derived_body_start: int
    derived_body_end: int
    derived_body_length: int

    # Expected repeated-template status tag for the row.
    repeated_template_status: str
|
||||
|
||||
|
||||
def read_u32_le(data: bytes, offset: int) -> int:
    """Return the unsigned 32-bit little-endian integer stored at ``offset`` in ``data``."""
    (value,) = struct.unpack_from("<I", data, offset)
    return value
|
||||
|
||||
|
|
@ -454,6 +489,73 @@ SCUMMVM_EVENT_NAME_HINTS: tuple[str, ...] = (
|
|||
)
|
||||
|
||||
|
||||
# (family name, member class labels) pairs; the family name becomes the first
# path component of each repeated-template status tag.
VERIFIED_REPEATED_TEMPLATE_FAMILIES: tuple[tuple[str, tuple[str, ...]], ...] = (
    (
        "referent-anchor-twin",
        ("JELYHACK", "JELYH2"),
    ),
    (
        "boot-event-core",
        ("AND_BOOT", "BRO_BOOT", "COR_BOOT", "REE_BOOT", "VAR_BOOT"),
    ),
    (
        "callback-eventtrigger",
        ("SURCAMNS", "SURCAMEW"),
    ),
    (
        "environmental-event",
        ("FLAMEBOX", "NOSTRIL", "STEAMBOX"),
    ),
)
|
||||
|
||||
|
||||
# One spec per family that gets a `<output_stem>.md` / `<output_stem>.tsv`
# decompile artifact.
FAMILY_ARTIFACT_SPECS: tuple[FamilyArtifactSpec, ...] = (
    # _BOOT classes.
    FamilyArtifactSpec(
        output_stem="boot_family_decompile",
        title="_BOOT Family Decompiled Event Sketches",
        labels=("AND_BOOT", "BRO_BOOT", "COR_BOOT", "REE_BOOT", "VAR_BOOT"),
    ),
    # SURCAM callback classes.
    FamilyArtifactSpec(
        output_stem="callback_family_decompile",
        title="SURCAM Callback Family Decompiled Event Sketches",
        labels=("SURCAMNS", "SURCAMEW"),
    ),
    # Environmental classes.
    FamilyArtifactSpec(
        output_stem="environmental_family_decompile",
        title="Environmental Family Decompiled Event Sketches",
        labels=("FLAMEBOX", "NOSTRIL", "STEAMBOX"),
    ),
)
|
||||
|
||||
|
||||
# Expected rows for the verified repeated families.  Positional columns:
# class_name, slot, raw_event_entry_word, raw_code_offset, derived_body_start,
# derived_body_end, derived_body_length, repeated_template_status.
VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS: tuple[RepeatedFamilyRowExpectation, ...] = (
    # referent-anchor-twin
    RepeatedFamilyRowExpectation("JELYHACK", 0x01, 0x002A, 0x00000001, 0x00D4, 0x00FE, 42, "referent-anchor-twin/shared-slot-0x01/same-length-template"),
    RepeatedFamilyRowExpectation("JELYH2", 0x01, 0x002A, 0x00000001, 0x00D4, 0x00FE, 42, "referent-anchor-twin/shared-slot-0x01/same-length-template"),
    # boot-event-core
    RepeatedFamilyRowExpectation("AND_BOOT", 0x0A, 0x0253, 0x00000001, 0x00D4, 0x0327, 595, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
    RepeatedFamilyRowExpectation("AND_BOOT", 0x0F, 0x0237, 0x00000254, 0x0327, 0x055E, 567, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
    RepeatedFamilyRowExpectation("AND_BOOT", 0x10, 0x003B, 0x0000048B, 0x055E, 0x0599, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
    RepeatedFamilyRowExpectation("BRO_BOOT", 0x0A, 0x02D5, 0x00000001, 0x00D4, 0x03A9, 725, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
    RepeatedFamilyRowExpectation("BRO_BOOT", 0x0F, 0x024C, 0x000002D6, 0x03A9, 0x05F5, 588, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
    RepeatedFamilyRowExpectation("BRO_BOOT", 0x10, 0x003B, 0x00000522, 0x05F5, 0x0630, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
    RepeatedFamilyRowExpectation("COR_BOOT", 0x0A, 0x0227, 0x00000001, 0x00D4, 0x02FB, 551, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
    RepeatedFamilyRowExpectation("COR_BOOT", 0x0F, 0x0234, 0x00000228, 0x02FB, 0x052F, 564, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
    RepeatedFamilyRowExpectation("COR_BOOT", 0x10, 0x003B, 0x0000045C, 0x052F, 0x056A, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
    RepeatedFamilyRowExpectation("REE_BOOT", 0x0A, 0x034B, 0x00000001, 0x00D4, 0x041F, 843, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
    RepeatedFamilyRowExpectation("REE_BOOT", 0x0F, 0x025C, 0x0000034C, 0x041F, 0x067B, 604, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
    RepeatedFamilyRowExpectation("REE_BOOT", 0x10, 0x003B, 0x000005A8, 0x067B, 0x06B6, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
    RepeatedFamilyRowExpectation("VAR_BOOT", 0x0A, 0x029A, 0x00000001, 0x00D4, 0x036E, 666, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
    RepeatedFamilyRowExpectation("VAR_BOOT", 0x0F, 0x0244, 0x0000029B, 0x036E, 0x05B2, 580, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
    RepeatedFamilyRowExpectation("VAR_BOOT", 0x10, 0x003B, 0x000004DF, 0x05B2, 0x05ED, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
    # callback-eventtrigger
    RepeatedFamilyRowExpectation("SURCAMNS", 0x01, 0x0051, 0x000000D2, 0x01B7, 0x0208, 81, "callback-eventtrigger/shared-slot-0x01/shared-slot-template"),
    RepeatedFamilyRowExpectation("SURCAMNS", 0x0A, 0x00D1, 0x00000001, 0x00E6, 0x01B7, 209, "callback-eventtrigger/shared-slot-0x0A/same-length-template"),
    RepeatedFamilyRowExpectation("SURCAMNS", 0x20, 0x02BA, 0x00000123, 0x0208, 0x04C2, 698, "callback-eventtrigger/shared-slot-0x20/same-length-template"),
    RepeatedFamilyRowExpectation("SURCAMNS", 0x21, 0x0709, 0x000003DD, 0x04C2, 0x0BCB, 1801, "callback-eventtrigger/shared-slot-0x21/shared-slot-template"),
    RepeatedFamilyRowExpectation("SURCAMNS", 0x22, 0x01A3, 0x00000AE6, 0x0BCB, 0x0D6E, 419, "callback-eventtrigger/shared-slot-0x22/same-length-template"),
    RepeatedFamilyRowExpectation("SURCAMEW", 0x01, 0x00F7, 0x000000D2, 0x01B7, 0x02AE, 247, "callback-eventtrigger/shared-slot-0x01/shared-slot-template"),
    RepeatedFamilyRowExpectation("SURCAMEW", 0x0A, 0x00D1, 0x00000001, 0x00E6, 0x01B7, 209, "callback-eventtrigger/shared-slot-0x0A/same-length-template"),
    RepeatedFamilyRowExpectation("SURCAMEW", 0x20, 0x02BA, 0x000001C9, 0x02AE, 0x0568, 698, "callback-eventtrigger/shared-slot-0x20/same-length-template"),
    RepeatedFamilyRowExpectation("SURCAMEW", 0x21, 0x0655, 0x00000483, 0x0568, 0x0BBD, 1621, "callback-eventtrigger/shared-slot-0x21/shared-slot-template"),
    RepeatedFamilyRowExpectation("SURCAMEW", 0x22, 0x01A3, 0x00000AD8, 0x0BBD, 0x0D60, 419, "callback-eventtrigger/shared-slot-0x22/same-length-template"),
    # environmental-event
    RepeatedFamilyRowExpectation("FLAMEBOX", 0x0A, 0x026A, 0x00000001, 0x00E0, 0x034A, 618, "environmental-event/shared-slot-0x0A/shared-slot-template"),
    RepeatedFamilyRowExpectation("FLAMEBOX", 0x20, 0x01AC, 0x0000026B, 0x034A, 0x04F6, 428, "environmental-event/shared-slot-0x20/shared-slot-template"),
    RepeatedFamilyRowExpectation("FLAMEBOX", 0x21, 0x029A, 0x00000417, 0x04F6, 0x0790, 666, "environmental-event/shared-slot-0x21/shared-slot-template"),
    RepeatedFamilyRowExpectation("NOSTRIL", 0x0A, 0x00C0, 0x00000001, 0x00E0, 0x01A0, 192, "environmental-event/shared-slot-0x0A/shared-slot-template"),
    RepeatedFamilyRowExpectation("NOSTRIL", 0x20, 0x0129, 0x000000C1, 0x01A0, 0x02C9, 297, "environmental-event/shared-slot-0x20/shared-slot-template"),
    RepeatedFamilyRowExpectation("NOSTRIL", 0x21, 0x01BE, 0x000001EA, 0x02C9, 0x0487, 446, "environmental-event/shared-slot-0x21/shared-slot-template"),
    RepeatedFamilyRowExpectation("STEAMBOX", 0x0A, 0x0266, 0x00000001, 0x00E0, 0x0346, 614, "environmental-event/shared-slot-0x0A/shared-slot-template"),
    RepeatedFamilyRowExpectation("STEAMBOX", 0x20, 0x01F6, 0x00000267, 0x0346, 0x053C, 502, "environmental-event/shared-slot-0x20/shared-slot-template"),
    RepeatedFamilyRowExpectation("STEAMBOX", 0x21, 0x02A7, 0x0000045D, 0x053C, 0x07E3, 679, "environmental-event/shared-slot-0x21/shared-slot-template"),
)
|
||||
|
||||
|
||||
def scummvm_event_name_hint(slot: int) -> str | None:
|
||||
if 0 <= slot < len(SCUMMVM_EVENT_NAME_HINTS):
|
||||
return SCUMMVM_EVENT_NAME_HINTS[slot]
|
||||
|
|
@ -532,6 +634,368 @@ def annotate_class_layout(chunks: list[ExtractedChunk]) -> None:
|
|||
chunk.class_parse_status = "parsed-class-layout"
|
||||
|
||||
|
||||
def derive_class_event_rows(chunk: ExtractedChunk, raw_data: bytes) -> list[ClassEventRow]:
    """Expand a parsed class chunk into one ``ClassEventRow`` per event slot.

    Args:
        chunk: Chunk metadata; only chunks whose ``class_parse_status`` is
            ``"parsed-class-layout"`` and whose object index, class id and
            event count are all set produce rows.
        raw_data: The raw bytes of the chunk.

    Returns:
        One row per slot in ``range(chunk.conservative_event_count)``, or an
        empty list when the chunk does not qualify.  A row's ``derived_body_*``
        fields are filled only when its raw code offset is non-zero, the chunk
        has a ``code_base_minus_one`` and the resulting window lies inside
        ``raw_data``; otherwise they stay ``None``.
    """
    if chunk.class_parse_status != "parsed-class-layout":
        return []
    if chunk.object_index is None or chunk.class_id is None or chunk.conservative_event_count is None:
        return []

    # Rows are 6 bytes each (u16 word + u32 code offset), read from byte 20 on.
    provisional_rows: list[tuple[int, int, int]] = []
    for slot in range(chunk.conservative_event_count):
        entry_offset = 20 + 6 * slot
        raw_word = read_u16_le(raw_data, entry_offset)
        raw_code_offset = read_u32_le(raw_data, entry_offset + 2)
        provisional_rows.append((slot, raw_word, raw_code_offset))

    non_zero_offsets = sorted(
        {raw_code_offset for _, _, raw_code_offset in provisional_rows if raw_code_offset != 0}
    )
    # Map every distinct offset to the next larger one once, instead of
    # rescanning the sorted list for each row.  The largest offset has no
    # successor, so its body runs to the end of the chunk.
    following_offset = dict(zip(non_zero_offsets, non_zero_offsets[1:]))
    code_base = chunk.code_base_minus_one
    data_length = len(raw_data)

    rows: list[ClassEventRow] = []
    for slot, raw_word, raw_code_offset in provisional_rows:
        derived_body_start: int | None = None
        derived_body_end: int | None = None
        derived_body_length: int | None = None

        if raw_code_offset != 0 and code_base is not None:
            body_start = code_base + raw_code_offset
            next_offset = following_offset.get(raw_code_offset)
            body_end = data_length if next_offset is None else code_base + next_offset
            # Keep the window only when it lies entirely inside the chunk.
            if 0 <= body_start <= body_end <= data_length:
                derived_body_start = body_start
                derived_body_end = body_end
                derived_body_length = body_end - body_start

        rows.append(
            ClassEventRow(
                entry_index=chunk.index,
                object_index=chunk.object_index,
                class_id=chunk.class_id,
                class_name_hint=chunk.class_name_hint or "",
                slot=slot,
                event_name_hint=scummvm_event_name_hint(slot),
                raw_event_entry_word=raw_word,
                raw_code_offset=raw_code_offset,
                derived_body_start=derived_body_start,
                derived_body_end=derived_body_end,
                derived_body_length=derived_body_length,
            )
        )

    return rows
|
||||
|
||||
|
||||
def build_class_event_rows(
    parsed_class_chunks: list[ExtractedChunk],
) -> tuple[list[ClassEventRow], dict[int, list[ClassEventRow]], dict[int, bytes]]:
    """Read every chunk's raw file and derive its event rows.

    Returns a triple of: all rows in chunk order, the rows keyed by chunk
    index, and the raw chunk bytes keyed by chunk index.
    """
    flat_rows: list[ClassEventRow] = []
    rows_per_entry: dict[int, list[ClassEventRow]] = {}
    bytes_per_entry: dict[int, bytes] = {}

    for chunk in parsed_class_chunks:
        chunk_bytes = pathlib.Path(chunk.raw_path).read_bytes()
        chunk_rows = derive_class_event_rows(chunk, chunk_bytes)
        bytes_per_entry[chunk.index] = chunk_bytes
        rows_per_entry[chunk.index] = chunk_rows
        flat_rows += chunk_rows

    return flat_rows, rows_per_entry, bytes_per_entry
|
||||
|
||||
|
||||
def build_repeated_template_status_map(
    parsed_class_chunks: list[ExtractedChunk],
    rows_by_entry: dict[int, list[ClassEventRow]],
    raw_data_by_entry: dict[int, bytes],
) -> dict[tuple[int, int], str]:
    """Tag event rows whose slot is shared inside a verified repeated family.

    For each family in ``VERIFIED_REPEATED_TEMPLATE_FAMILIES`` with at least
    two member chunks present, rows with a derived body window are grouped by
    slot.  Every slot populated by two or more members gets the status
    ``<family>/shared-slot-0xNN/<suffix>``, where the suffix reports whether
    the bodies are byte-identical, merely equal in length, or neither.

    Returns:
        Mapping of ``(chunk index, slot)`` to the status string.
    """
    label_to_chunk = {}
    for chunk in parsed_class_chunks:
        if chunk.primary_label:
            label_to_chunk[chunk.primary_label] = chunk

    statuses: dict[tuple[int, int], str] = {}
    for family_name, labels in VERIFIED_REPEATED_TEMPLATE_FAMILIES:
        members = [label_to_chunk[label] for label in labels if label in label_to_chunk]
        if len(members) < 2:
            continue

        # slot -> [(chunk, row, body bytes)] across all family members.
        grouped: dict[int, list[tuple[ExtractedChunk, ClassEventRow, bytes]]] = {}
        for member in members:
            member_bytes = raw_data_by_entry.get(member.index)
            if member_bytes is None:
                continue
            for row in rows_by_entry.get(member.index, []):
                start, end = row.derived_body_start, row.derived_body_end
                if row.raw_code_offset == 0 or start is None or end is None:
                    continue
                grouped.setdefault(row.slot, []).append((member, row, member_bytes[start:end]))

        for slot, group in grouped.items():
            if len(group) < 2:
                continue

            distinct_bodies = {body for _, _, body in group}
            if len(distinct_bodies) == 1:
                suffix = "exact-body-clone"
            elif len({len(body) for body in distinct_bodies}) == 1:
                suffix = "same-length-template"
            else:
                suffix = "shared-slot-template"

            tag = f"{family_name}/shared-slot-0x{slot:02X}/{suffix}"
            for member, row, _ in group:
                statuses[(member.index, row.slot)] = tag

    return statuses
|
||||
|
||||
|
||||
def format_optional_hex(value: int | None, width: int = 0) -> str:
    """Render ``value`` as ``0x``-prefixed upper-case hex, or ``""`` for ``None``.

    A positive ``width`` zero-pads the digits to that many characters.
    """
    if value is None:
        return ""
    digits = f"{value:0{width}X}" if width > 0 else f"{value:X}"
    return "0x" + digits
|
||||
|
||||
|
||||
def hex_edge(data: bytes, width: int = 8) -> str:
    """Return the hex text of the first ``width`` bytes of ``data`` (``""`` if empty)."""
    return data[:width].hex() if data else ""
|
||||
|
||||
|
||||
def hex_tail(data: bytes, width: int = 8) -> str:
    """Return the hex text of the last ``width`` bytes of ``data``.

    Returns ``""`` for empty data and for a non-positive ``width``.  The width
    guard matters because ``data[-0:]`` is the whole buffer, not an empty
    slice, so a zero width would otherwise dump every byte.
    """
    if not data or width <= 0:
        return ""
    return data[-width:].hex()
|
||||
|
||||
|
||||
def write_family_decompile_artifact(
    out_dir: pathlib.Path,
    parsed_class_chunks: list[ExtractedChunk],
    rows_by_entry: dict[int, list[ClassEventRow]],
    raw_data_by_entry: dict[int, bytes],
    repeated_status_by_row: dict[tuple[int, int], str],
    spec: FamilyArtifactSpec,
) -> None:
    """Write ``<spec.output_stem>.md`` and ``.tsv`` for one class family.

    Family members are the parsed chunks whose primary label appears in
    ``spec.labels``, processed in label order.  Nothing is written when no
    member is present.  Members without any row carrying a non-zero raw code
    offset are left out of both files.
    """
    # NOTE(review): leading spaces inside the Markdown string literals below
    # are reproduced exactly as seen; confirm them against the repository copy.
    wanted_labels = set(spec.labels)
    members = sorted(
        (chunk for chunk in parsed_class_chunks if chunk.primary_label in wanted_labels),
        key=lambda chunk: chunk.primary_label or "",
    )
    if not members:
        return

    tsv_lines = [
        "entry_index\tclass_id\tclass_name\tslot\tevent_name_hint\traw_event_entry_word\traw_code_offset\tderived_body_start\tderived_body_end\tderived_body_length\trepeated_template_status\tbody_sha1\tbody_prefix_hex\tbody_suffix_hex"
    ]
    md_lines = [
        f"# {spec.title}",
        "",
        "This is a reversible per-class rendering derived directly from `class_event_index.tsv` plus the raw extracted chunk bytes.",
        "ScummVM event labels remain hints only; the authoritative data here is the slot id, raw row bytes, and derived body window.",
        "",
    ]

    for chunk in members:
        coded_rows = [row for row in rows_by_entry.get(chunk.index, []) if row.raw_code_offset != 0]
        if not coded_rows:
            continue
        chunk_bytes = raw_data_by_entry[chunk.index]

        md_lines += [
            f"## {chunk.primary_label}",
            "",
            "```yaml",
            "class:",
            f" entry_index: 0x{chunk.index:03X}",
            f" class_id: 0x{chunk.class_id:X}",
            f" class_name: {chunk.primary_label}",
            f" class_object_index: 0x{chunk.object_index:X}",
            f" raw_code_base_u32: 0x{chunk.raw_code_base_u32:X}",
            f" code_base_minus_one: 0x{chunk.code_base_minus_one:X}",
            f" conservative_event_count: {chunk.conservative_event_count}",
            " events:",
        ]

        for row in coded_rows:
            if row.derived_body_start is not None and row.derived_body_end is not None:
                body = chunk_bytes[row.derived_body_start:row.derived_body_end]
            else:
                body = b""
            status = repeated_status_by_row.get((row.entry_index, row.slot), "")
            digest = hashlib.sha1(body).hexdigest() if body else ""
            prefix_hex = hex_edge(body)
            suffix_hex = hex_tail(body)
            start_hex = format_optional_hex(row.derived_body_start, 4)
            end_hex = format_optional_hex(row.derived_body_end, 4)
            has_length = row.derived_body_length is not None

            md_lines += [
                f" - slot: 0x{row.slot:02x}",
                f" event_name_hint: {row.event_name_hint or ''}",
                f" raw_event_entry_word: 0x{row.raw_event_entry_word:04x}",
                f" raw_code_offset: 0x{row.raw_code_offset:08x}",
                f" derived_body_start: {start_hex.lower() or 'null'}",
                f" derived_body_end: {end_hex.lower() or 'null'}",
                f" derived_body_length: {row.derived_body_length if has_length else 'null'}",
                f" repeated_template_status: {status or 'unique-or-unclassified'}",
                f" body_sha1: {digest or 'null'}",
                f" body_prefix_hex: {prefix_hex or 'null'}",
                f" body_suffix_hex: {suffix_hex or 'null'}",
            ]

            tsv_lines.append(
                "\t".join(
                    [
                        str(row.entry_index),
                        f"0x{row.class_id:X}",
                        chunk.primary_label or "",
                        f"0x{row.slot:02X}",
                        row.event_name_hint or "",
                        f"0x{row.raw_event_entry_word:04X}",
                        f"0x{row.raw_code_offset:08X}",
                        start_hex,
                        end_hex,
                        str(row.derived_body_length) if has_length else "",
                        status,
                        digest,
                        prefix_hex,
                        suffix_hex,
                    ]
                )
            )

        md_lines += ["```", ""]

    md_path = out_dir / f"{spec.output_stem}.md"
    tsv_path = out_dir / f"{spec.output_stem}.tsv"
    md_path.write_text("\n".join(md_lines), encoding="utf-8")
    tsv_path.write_text("\n".join(tsv_lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def validate_verified_repeated_family_regressions(
    parsed_class_chunks: list[ExtractedChunk],
    rows_by_entry: dict[int, list[ClassEventRow]],
    repeated_status_by_row: dict[tuple[int, int], str],
) -> list[str]:
    """Check extracted rows against ``VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS``.

    Two checks run: per class, the set of slots with a non-zero raw code
    offset must equal the expected slot set; per expectation, the row's raw
    values, derived body window and repeated-template status must match.

    Returns:
        TSV report lines (header first), one per check performed.

    Raises:
        ValueError: If any check fails; the message lists every mismatch.
    """
    label_to_chunk = {
        chunk.primary_label: chunk
        for chunk in parsed_class_chunks
        if chunk.primary_label
    }

    slots_expected: dict[str, set[int]] = {}
    for expectation in VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS:
        slots_expected.setdefault(expectation.class_name, set()).add(expectation.slot)

    def slot_list(slots: set[int]) -> str:
        return ",".join(f"0x{slot:02X}" for slot in sorted(slots))

    report_lines = ["record_type\tclass_name\tslot\texpected\tactual\tstatus"]
    errors: list[str] = []

    # Pass 1: slot sets per class.
    for class_name, expected_slots in sorted(slots_expected.items()):
        chunk = label_to_chunk.get(class_name)
        if chunk is None:
            actual_slots: set[int] = set()
        else:
            actual_slots = {
                row.slot
                for row in rows_by_entry.get(chunk.index, [])
                if row.raw_code_offset != 0
            }
        status = "ok" if actual_slots == expected_slots else "mismatch"
        report_lines.append(
            f"slot-set\t{class_name}\t*\t{slot_list(expected_slots)}\t{slot_list(actual_slots)}\t{status}"
        )
        if status != "ok":
            errors.append(
                f"{class_name}: expected non-zero slots {sorted(expected_slots)}, found {sorted(actual_slots)}"
            )

    # Pass 2: individual rows.
    for expectation in VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS:
        row_prefix = f"row\t{expectation.class_name}\t0x{expectation.slot:02X}"

        chunk = label_to_chunk.get(expectation.class_name)
        if chunk is None:
            errors.append(f"missing repeated-family class {expectation.class_name}")
            report_lines.append(f"{row_prefix}\tpresent\tmissing-class\tmismatch")
            continue

        row = None
        for candidate in rows_by_entry.get(chunk.index, []):
            if candidate.slot == expectation.slot:
                row = candidate
                break
        if row is None:
            errors.append(f"missing row {expectation.class_name} slot 0x{expectation.slot:02X}")
            report_lines.append(f"{row_prefix}\tpresent\tmissing-row\tmismatch")
            continue

        actual_status = repeated_status_by_row.get((row.entry_index, row.slot), "")
        actual_values = (
            row.raw_event_entry_word,
            row.raw_code_offset,
            row.derived_body_start,
            row.derived_body_end,
            row.derived_body_length,
            actual_status,
        )
        expected_values = (
            expectation.raw_event_entry_word,
            expectation.raw_code_offset,
            expectation.derived_body_start,
            expectation.derived_body_end,
            expectation.derived_body_length,
            expectation.repeated_template_status,
        )
        status = "ok" if actual_values == expected_values else "mismatch"

        expected_text = "|".join(
            [
                f"0x{expectation.raw_event_entry_word:04X}",
                f"0x{expectation.raw_code_offset:08X}",
                f"0x{expectation.derived_body_start:04X}",
                f"0x{expectation.derived_body_end:04X}",
                str(expectation.derived_body_length),
                expectation.repeated_template_status,
            ]
        )
        actual_text = "|".join(
            [
                f"0x{row.raw_event_entry_word:04X}",
                f"0x{row.raw_code_offset:08X}",
                format_optional_hex(row.derived_body_start, 4),
                format_optional_hex(row.derived_body_end, 4),
                str(row.derived_body_length if row.derived_body_length is not None else ""),
                actual_status,
            ]
        )
        report_lines.append(f"{row_prefix}\t{expected_text}\t{actual_text}\t{status}")
        if status != "ok":
            errors.append(
                f"{expectation.class_name} slot 0x{expectation.slot:02X}: expected {expected_values}, found {actual_values}"
            )

    if errors:
        raise ValueError(
            "repeated-family regression mismatch:\n- " + "\n- ".join(errors)
        )

    return report_lines
|
||||
|
||||
|
||||
def readable_neighbor_chunks(
|
||||
center: ExtractedChunk,
|
||||
chunk_by_index: dict[int, ExtractedChunk],
|
||||
|
|
@ -1556,6 +2020,17 @@ def write_summary(out_dir: pathlib.Path, input_path: pathlib.Path, data: bytes,
|
|||
"entry_index\tobject_index\tclass_id\tclass_name_hint\traw_code_base_u32\tcode_base_minus_one\tconservative_event_count\tevent_table_end\tclass_parse_status\tdata_offset\tdeclared_size\tprimary_label"
|
||||
]
|
||||
parsed_class_chunks = [chunk for chunk in chunks if chunk.class_parse_status == "parsed-class-layout"]
|
||||
class_event_rows, rows_by_entry, raw_data_by_entry = build_class_event_rows(parsed_class_chunks)
|
||||
repeated_status_by_row = build_repeated_template_status_map(
|
||||
parsed_class_chunks,
|
||||
rows_by_entry,
|
||||
raw_data_by_entry,
|
||||
)
|
||||
repeated_family_regression_lines = validate_verified_repeated_family_regressions(
|
||||
parsed_class_chunks,
|
||||
rows_by_entry,
|
||||
repeated_status_by_row,
|
||||
)
|
||||
for chunk in parsed_class_chunks:
|
||||
class_layout_lines.append(
|
||||
"{index}\t0x{object_index:X}\t0x{class_id:X}\t{class_name_hint}\t0x{raw_code_base_u32:X}\t0x{code_base_minus_one:X}\t{conservative_event_count}\t0x{event_table_end:X}\t{class_parse_status}\t0x{data_offset:X}\t0x{declared_size:X}\t{primary_label}".format(
|
||||
|
|
@ -1576,28 +2051,39 @@ def write_summary(out_dir: pathlib.Path, input_path: pathlib.Path, data: bytes,
|
|||
(out_dir / "class_layout_index.tsv").write_text("\n".join(class_layout_lines) + "\n", encoding="utf-8")
|
||||
|
||||
class_event_lines = [
|
||||
"entry_index\tobject_index\tclass_id\tclass_name_hint\tslot\tevent_name_hint\traw_event_entry_word\traw_code_offset"
|
||||
"entry_index\tobject_index\tclass_id\tclass_name_hint\tslot\tevent_name_hint\traw_event_entry_word\traw_code_offset\tderived_body_start\tderived_body_end\tderived_body_length\trepeated_template_status"
|
||||
]
|
||||
for chunk in parsed_class_chunks:
|
||||
raw_data = pathlib.Path(chunk.raw_path).read_bytes()
|
||||
assert chunk.conservative_event_count is not None
|
||||
for slot in range(chunk.conservative_event_count):
|
||||
entry_offset = 20 + 6 * slot
|
||||
raw_word = read_u16_le(raw_data, entry_offset)
|
||||
raw_code_offset = read_u32_le(raw_data, entry_offset + 2)
|
||||
class_event_lines.append(
|
||||
"{entry_index}\t0x{object_index:X}\t0x{class_id:X}\t{class_name_hint}\t0x{slot:02X}\t{event_name_hint}\t0x{raw_word:04X}\t0x{raw_code_offset:08X}".format(
|
||||
entry_index=chunk.index,
|
||||
object_index=chunk.object_index,
|
||||
class_id=chunk.class_id,
|
||||
class_name_hint=chunk.class_name_hint or "",
|
||||
slot=slot,
|
||||
event_name_hint=scummvm_event_name_hint(slot) or "",
|
||||
raw_word=raw_word,
|
||||
raw_code_offset=raw_code_offset,
|
||||
)
|
||||
for row in class_event_rows:
|
||||
class_event_lines.append(
|
||||
"{entry_index}\t0x{object_index:X}\t0x{class_id:X}\t{class_name_hint}\t0x{slot:02X}\t{event_name_hint}\t0x{raw_event_entry_word:04X}\t0x{raw_code_offset:08X}\t{derived_body_start}\t{derived_body_end}\t{derived_body_length}\t{repeated_template_status}".format(
|
||||
entry_index=row.entry_index,
|
||||
object_index=row.object_index,
|
||||
class_id=row.class_id,
|
||||
class_name_hint=row.class_name_hint,
|
||||
slot=row.slot,
|
||||
event_name_hint=row.event_name_hint or "",
|
||||
raw_event_entry_word=row.raw_event_entry_word,
|
||||
raw_code_offset=row.raw_code_offset,
|
||||
derived_body_start=format_optional_hex(row.derived_body_start, 4),
|
||||
derived_body_end=format_optional_hex(row.derived_body_end, 4),
|
||||
derived_body_length=(row.derived_body_length if row.derived_body_length is not None else ""),
|
||||
repeated_template_status=repeated_status_by_row.get((row.entry_index, row.slot), ""),
|
||||
)
|
||||
)
|
||||
(out_dir / "class_event_index.tsv").write_text("\n".join(class_event_lines) + "\n", encoding="utf-8")
|
||||
for family_artifact_spec in FAMILY_ARTIFACT_SPECS:
|
||||
write_family_decompile_artifact(
|
||||
out_dir,
|
||||
parsed_class_chunks,
|
||||
rows_by_entry,
|
||||
raw_data_by_entry,
|
||||
repeated_status_by_row,
|
||||
family_artifact_spec,
|
||||
)
|
||||
(out_dir / "repeated_family_regressions.tsv").write_text(
|
||||
"\n".join(repeated_family_regression_lines) + "\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
neighborhood_lines = [
|
||||
"center_index\tneighbor_index\tprimary_label\tfield_names\tfield_tags"
|
||||
|
|
@ -1763,7 +2249,9 @@ def write_summary(out_dir: pathlib.Path, input_path: pathlib.Path, data: bytes,
|
|||
lines.append("- `.strings.txt` files are the main human-readable output for now; `.txt` files are emitted only for chunks that look text-like.")
|
||||
lines.append("- `descriptor_index.tsv` summarizes guessed class labels, field names, and compact tag patterns for descriptor-like chunks.")
|
||||
lines.append("- `class_layout_index.tsv` records the conservative owner-loaded class parsing state: object index, class id, class-name hint, raw bytes-8..11 field, derived code-base-minus-one, and event-count/table-end values when the local divisibility and bounds checks succeed.")
|
||||
lines.append("- `class_event_index.tsv` expands parsed owner-loaded classes into raw 6-byte event rows with slot numbers, ScummVM event-name hints for `0x00..0x1f`, unresolved leading words, and raw code-offset dwords for round-trip tooling work.")
|
||||
lines.append("- `class_event_index.tsv` now also emits derived body-window columns (`derived_body_start`, `derived_body_end`, `derived_body_length`) plus conservative `repeated_template_status` tags for verified repeated families.")
|
||||
lines.append("- `boot_family_decompile.md` / `.tsv`, `callback_family_decompile.md` / `.tsv`, and `environmental_family_decompile.md` / `.tsv` now provide reversible per-class decompile artifacts for the `_BOOT`, `SURCAM*`, and environmental repeated-family lanes.")
|
||||
lines.append("- `repeated_family_regressions.tsv` enforces the current repeated-family slot sets plus the verified raw-row and derived body-window fields for `JELYHACK/JELYH2`, `_BOOT`, `SURCAM*`, and `FLAMEBOX/NOSTRIL/STEAMBOX`.")
|
||||
lines.append("- `descriptor_neighborhoods.tsv` captures local table neighborhoods around trigger/event-related classes such as `JELYHACK`, `NPCTRIG`, `CRUZTRIG`, `TRIGPAD`, and `SPECIAL`.")
|
||||
lines.append("- `referent_anchor_event_graph.tsv` groups referent-bearing descriptors with nearby event-bearing neighbors so the attachment model can be inspected without ad hoc grepping.")
|
||||
lines.append("- `jelyhack_island_graph.md` now uses a wider local window so the `JELYHACK` / `JELYH2` anchors can be inspected alongside the nearby event-bearing `REE_BOOT`, `SURCAMEW`, and `SFXTRIG` descriptors rather than stopping at the referent-only neighbors.")
|
||||
|
|
|
|||
165
tools/usecode_family_compare.py
Normal file
165
tools/usecode_family_compare.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
import csv
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Repository root: two directory levels above this file.
ROOT = Path(__file__).resolve().parents[1]
# Inputs produced by the extractor under USECODE/EUSECODE_extracted.
LAYOUT_PATH = ROOT.joinpath("USECODE", "EUSECODE_extracted", "class_layout_index.tsv")
EVENT_PATH = ROOT.joinpath("USECODE", "EUSECODE_extracted", "class_event_index.tsv")
CHUNKS_DIR = ROOT.joinpath("USECODE", "EUSECODE_extracted", "chunks")
|
||||
|
||||
# Families to compare: member class names plus the event slots examined for each.
FAMILIES = {
    "BOOT": {
        "classes": [
            "AND_BOOT",
            "BRO_BOOT",
            "COR_BOOT",
            "REE_BOOT",
            "VAR_BOOT",
        ],
        "slots": [0x0A, 0x0F, 0x10],
    },
    "SURCAM": {
        "classes": [
            "SURCAMNS",
            "SURCAMEW",
        ],
        "slots": [0x20, 0x21, 0x22],
    },
    "JELY": {
        "classes": [
            "JELYHACK",
            "JELYH2",
        ],
        "slots": [0x01],
    },
}
|
||||
|
||||
|
||||
def parse_hex(value: str) -> int:
    """Parse a hexadecimal string (with or without a ``0x`` prefix) into an int."""
    return int(value, base=16)
|
||||
|
||||
|
||||
def common_prefix_length(blobs: list[bytes]) -> int:
    """Return how many leading bytes are identical across all ``blobs``.

    An empty list yields 0; the count never exceeds the shortest blob.
    """
    shared = 0
    # zip stops at the shortest blob and yields nothing for an empty list.
    for column in zip(*blobs):
        if len(set(column)) > 1:
            break
        shared += 1
    return shared
|
||||
|
||||
|
||||
def common_suffix_length(blobs: list[bytes]) -> int:
    """Return how many trailing bytes are identical across all ``blobs``."""
    # A shared suffix is a shared prefix of the reversed blobs.
    mirrored = [blob[::-1] for blob in blobs]
    return common_prefix_length(mirrored)
|
||||
|
||||
|
||||
def first_diff_positions(blobs: list[bytes], limit: int = 8) -> list[int]:
    """Return the first byte indexes at which the blobs disagree.

    An index counts as a difference when the blobs do not all hold the same
    byte there; a blob that is too short to have that index is treated as
    holding ``None``, so a length mismatch is reported as a difference too.

    Args:
        blobs: Byte strings to compare. May be empty.
        limit: Maximum number of positions to report. Values ``<= 0`` yield
            an empty list.

    Returns:
        Ascending list of at most ``limit`` differing indexes.
    """
    if limit <= 0:
        return []

    # default=0 keeps an empty ``blobs`` list from raising ValueError, which
    # matches common_prefix_length() returning 0 for the same input.
    longest = max((len(blob) for blob in blobs), default=0)

    positions: list[int] = []
    for index in range(longest):
        values = {blob[index] if index < len(blob) else None for blob in blobs}
        if len(values) > 1:
            positions.append(index)
            if len(positions) >= limit:
                break
    return positions
|
||||
|
||||
|
||||
def load_layouts(targets: set[str]) -> dict[str, dict[str, str]]:
    """Read the layout index and keep the rows for the requested classes.

    ``LAYOUT_PATH`` is read as a tab-separated file with a header row. Rows
    whose ``class_name_hint`` is in ``targets`` are returned keyed by that
    name; if a name occurs more than once, the last row wins.
    """
    with LAYOUT_PATH.open("r", encoding="utf-8", newline="") as handle:
        return {
            record["class_name_hint"]: record
            for record in csv.DictReader(handle, delimiter="\t")
            if record["class_name_hint"] in targets
        }
|
||||
|
||||
|
||||
def load_events(targets: set[str]) -> dict[str, list[dict[str, str]]]:
    """Read the event index and group the rows of the requested classes.

    ``EVENT_PATH`` is read as a tab-separated file with a header row. Rows
    whose ``class_name_hint`` is in ``targets`` are collected per class name,
    and each class's rows are ordered by the numeric value of their
    hexadecimal ``slot`` column.
    """

    def slot_number(record: dict[str, str]) -> int:
        return parse_hex(record["slot"])

    grouped: dict[str, list[dict[str, str]]] = {}
    with EVENT_PATH.open("r", encoding="utf-8", newline="") as handle:
        for record in csv.DictReader(handle, delimiter="\t"):
            name = record["class_name_hint"]
            if name not in targets:
                continue
            if name not in grouped:
                grouped[name] = []
            grouped[name].append(record)

    # sorted() is stable, so rows sharing a slot keep their file order.
    return {name: sorted(records, key=slot_number) for name, records in grouped.items()}
|
||||
|
||||
|
||||
def resolve_chunk(data_offset: int) -> Path:
    """Locate the single extracted chunk file that starts at ``data_offset``.

    The file is searched for in ``CHUNKS_DIR`` under the name pattern
    ``chunk_*_off_<OFFSET>_len_*.bin``, where ``<OFFSET>`` is ``data_offset``
    as six upper-case hex digits.

    Args:
        data_offset: Offset value embedded in the chunk file name.

    Returns:
        Path of the one matching chunk file.

    Raises:
        RuntimeError: If zero or more than one file matches.
    """
    # Glob relative to CHUNKS_DIR so that only the file-name part is treated
    # as a pattern; glob metacharacters in the checkout path (for example
    # "[" or "]") would otherwise be interpreted and break the lookup.
    pattern = f"chunk_*_off_{data_offset:06X}_len_*.bin"
    matches = sorted(CHUNKS_DIR.glob(pattern))
    if len(matches) != 1:
        found = [str(match) for match in matches]
        raise RuntimeError(f"chunk lookup failed for 0x{data_offset:06X}: {found}")
    return matches[0]
|
||||
|
||||
|
||||
def build_rows() -> dict[tuple[str, int], dict[str, object]]:
    """Slice the event bodies of every class listed in ``FAMILIES``.

    For each class that has a layout row, the class's chunk file is read and
    every event row with a non-zero ``raw_code_offset`` is turned into a body
    slice of that chunk:

    * ``start`` is ``code_base_minus_one + raw_code_offset``;
    * ``end`` is ``code_base_minus_one`` plus the next larger non-zero code
      offset used by the same class, or the end of the chunk when there is
      no larger offset.

    Returns:
        Mapping from ``(class_name, slot)`` to a dict with the keys
        ``class_name``, ``slot``, ``event_name_hint``,
        ``raw_event_entry_word``, ``raw_code_offset``, ``start``, ``end``,
        ``length``, ``sha1``, ``preview`` (first 16 body bytes as spaced
        hex), ``chunk_path`` and ``body``. Event rows with a zero code offset
        produce no entry.

    Raises:
        RuntimeError: Propagated from ``resolve_chunk`` when a chunk file
            cannot be identified unambiguously.
    """
    targets = {name for family in FAMILIES.values() for name in family["classes"]}
    layouts = load_layouts(targets)
    events = load_events(targets)
    rows_by_key: dict[tuple[str, int], dict[str, object]] = {}

    for class_name, layout in layouts.items():
        chunk_path = resolve_chunk(parse_hex(layout["data_offset"]))
        blob = chunk_path.read_bytes()
        code_base_minus_one = parse_hex(layout["code_base_minus_one"])
        chunk_label = str(chunk_path).replace("\\", "/")

        # A class may have a layout row but no event rows at all; treat that
        # as "no bodies" instead of failing with KeyError.
        coded_rows = []
        for row in events.get(class_name, []):
            code_offset = parse_hex(row["raw_code_offset"])
            if code_offset != 0:
                coded_rows.append((row, code_offset))

        # Map each distinct offset to the next larger one so that the end of
        # a body can be looked up directly instead of rescanning per row.
        offsets = sorted({code_offset for _, code_offset in coded_rows})
        following_offset = dict(zip(offsets, offsets[1:]))

        for row, code_offset in coded_rows:
            slot = parse_hex(row["slot"])
            start = code_base_minus_one + code_offset
            successor = following_offset.get(code_offset)
            # The body with the largest offset runs to the end of the chunk.
            end = len(blob) if successor is None else code_base_minus_one + successor
            body = blob[start:end]
            rows_by_key[(class_name, slot)] = {
                "class_name": class_name,
                "slot": slot,
                "event_name_hint": row["event_name_hint"],
                "raw_event_entry_word": row["raw_event_entry_word"],
                "raw_code_offset": row["raw_code_offset"],
                "start": start,
                "end": end,
                "length": len(body),
                "sha1": hashlib.sha1(body).hexdigest(),
                "preview": body[:16].hex(" "),
                "chunk_path": chunk_label,
                "body": body,
            }

    return rows_by_key
|
||||
|
||||
|
||||
def main() -> None:
    """Print a per-family, per-slot comparison of event bodies to stdout.

    For every family in ``FAMILIES`` and every slot of that family, one
    tab-separated line is printed per class that has a body for the slot,
    followed by:

    * ``identical_groups`` - class names grouped by identical body SHA-1;
    * ``common_prefix_len`` / ``common_suffix_len`` across the bodies;
    * ``first_diff_positions`` - the first differing byte indexes.

    Classes without a body for the slot are listed on a ``missing=`` line
    instead of aborting the whole report.
    """
    rows_by_key = build_rows()
    for family_name, family in FAMILIES.items():
        print(f"## {family_name}")
        for slot in family["slots"]:
            print(f"SLOT 0x{slot:02X}")

            # build_rows() has no entry for a class/slot pair whose code
            # offset is zero or that is absent from the index files.
            subset = []
            missing = []
            for class_name in family["classes"]:
                row = rows_by_key.get((class_name, slot))
                if row is None:
                    missing.append(class_name)
                else:
                    subset.append(row)
            if missing:
                print("missing=" + json.dumps(missing))

            for row in subset:
                print(
                    "\t".join(
                        [
                            str(row["class_name"]),
                            str(row["event_name_hint"]),
                            str(row["raw_event_entry_word"]),
                            str(row["raw_code_offset"]),
                            f"{row['start']:04X}-{row['end']:04X}",
                            str(row["length"]),
                            str(row["sha1"])[:12],
                            str(row["preview"]),
                            str(row["chunk_path"]),
                        ]
                    )
                )

            if not subset:
                # Nothing to compare for this slot.
                print()
                continue

            groups: dict[str, list[str]] = {}
            for row in subset:
                groups.setdefault(str(row["sha1"]), []).append(str(row["class_name"]))
            blobs = [row["body"] for row in subset]
            print("identical_groups=" + json.dumps(list(groups.values())))
            print(
                f"common_prefix_len={common_prefix_length(blobs)} "
                f"common_suffix_len={common_suffix_length(blobs)}"
            )
            print("first_diff_positions=" + json.dumps(first_diff_positions(blobs)))
            print()


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue