Add detailed class event processing and family comparison tools

- Enhance `extract_eusecode_flx.py` to derive class event rows with additional metadata including derived body windows and repeated template statuses.
- Introduce `usecode_family_compare.py` for comparing event families, analyzing commonalities in event bodies, and generating reports on identical groups and differences.
- Implement new data structures for managing class event rows and family artifact specifications.
- Update output formats to include derived body information and repeated family regression checks.
- Ensure robust validation of repeated family expectations against actual extracted data.
This commit is contained in:
MaddoScientisto 2026-03-22 23:24:46 +01:00
commit 4d3c8cd81b
23 changed files with 15033 additions and 14221 deletions

View file

@ -15,6 +15,7 @@ to support the next decoding pass.
from __future__ import annotations
import argparse
import hashlib
import json
import pathlib
import struct
@ -61,6 +62,21 @@ class ExtractedChunk:
class_parse_status: str | None = None
@dataclass(frozen=True)
class ClassEventRow:
    """One event-table row of a parsed class chunk plus its derived body window.

    Instances are produced by ``derive_class_event_rows``.  The three
    ``derived_*`` fields are ``None`` whenever no in-bounds body window could
    be derived for the row (for example when ``raw_code_offset`` is zero).
    """

    # ``index`` of the chunk that owns this row.
    entry_index: int
    # Copied unchanged from the owning chunk.
    object_index: int
    class_id: int
    # Chunk class-name hint; an empty string when the chunk has none.
    class_name_hint: str
    # Zero-based position of the row inside the chunk's event table.
    slot: int
    # Result of ``scummvm_event_name_hint(slot)``; may be ``None``.
    event_name_hint: str | None
    # 16-bit word read at the start of the 6-byte row.
    raw_event_entry_word: int
    # 32-bit value read two bytes into the row; zero rows get no body window.
    raw_code_offset: int
    # ``code_base_minus_one + raw_code_offset`` when the window is in bounds.
    derived_body_start: int | None
    # Start of the next-higher non-zero offset's body, or the chunk length.
    derived_body_end: int | None
    # ``derived_body_end - derived_body_start``.
    derived_body_length: int | None
@dataclass(frozen=True)
class FlxTable:
entry_count: int
@ -69,6 +85,25 @@ class FlxTable:
entries: list[CandidateEntry]
@dataclass(frozen=True)
class FamilyArtifactSpec:
    """Describes one per-family decompile artifact pair (``.md`` and ``.tsv``)."""

    # File-name stem; the writer emits ``<output_stem>.md`` and ``<output_stem>.tsv``.
    output_stem: str
    # Text of the top-level heading in the markdown artifact.
    title: str
    # Chunk primary labels that belong to the family.
    labels: tuple[str, ...]
@dataclass(frozen=True)
class RepeatedFamilyRowExpectation:
    """Pinned values for one event row of a verified repeated family.

    ``validate_verified_repeated_family_regressions`` compares every field
    except ``class_name`` and ``slot`` against the freshly extracted row; the
    latter two are used to locate that row.
    """

    # Primary label of the chunk the row belongs to.
    class_name: str
    # Event-table slot of the row inside that chunk.
    slot: int
    # Expected raw row fields.
    raw_event_entry_word: int
    raw_code_offset: int
    # Expected derived body window (never ``None`` for a verified row).
    derived_body_start: int
    derived_body_end: int
    derived_body_length: int
    # Expected ``<family>/shared-slot-0xNN/<suffix>`` status string.
    repeated_template_status: str
def read_u32_le(data: bytes, offset: int) -> int:
    """Decode the unsigned 32-bit little-endian integer at ``offset`` in ``data``.

    Raises ``struct.error`` when fewer than four bytes are available there.
    """
    (value,) = struct.unpack_from("<I", data, offset)
    return value
@ -454,6 +489,73 @@ SCUMMVM_EVENT_NAME_HINTS: tuple[str, ...] = (
)
# Families whose members are compared slot-by-slot when repeated-template
# statuses are assigned.  Each entry is (family name, chunk primary labels);
# the family name becomes the first component of the emitted status string.
VERIFIED_REPEATED_TEMPLATE_FAMILIES: tuple[tuple[str, tuple[str, ...]], ...] = (
("referent-anchor-twin", ("JELYHACK", "JELYH2")),
("boot-event-core", ("AND_BOOT", "BRO_BOOT", "COR_BOOT", "REE_BOOT", "VAR_BOOT")),
("callback-eventtrigger", ("SURCAMNS", "SURCAMEW")),
("environmental-event", ("FLAMEBOX", "NOSTRIL", "STEAMBOX")),
)
# One spec per family decompile artifact pair; each spec names the output
# file stem, the markdown heading, and the chunk labels that make up the
# family.
FAMILY_ARTIFACT_SPECS: tuple[FamilyArtifactSpec, ...] = (
FamilyArtifactSpec(
output_stem="boot_family_decompile",
title="_BOOT Family Decompiled Event Sketches",
labels=("AND_BOOT", "BRO_BOOT", "COR_BOOT", "REE_BOOT", "VAR_BOOT"),
),
FamilyArtifactSpec(
output_stem="callback_family_decompile",
title="SURCAM Callback Family Decompiled Event Sketches",
labels=("SURCAMNS", "SURCAMEW"),
),
FamilyArtifactSpec(
output_stem="environmental_family_decompile",
title="Environmental Family Decompiled Event Sketches",
labels=("FLAMEBOX", "NOSTRIL", "STEAMBOX"),
),
)
# Regression table for the verified repeated families.  Positional arguments
# follow the RepeatedFamilyRowExpectation field order:
#   class_name, slot, raw_event_entry_word, raw_code_offset,
#   derived_body_start, derived_body_end, derived_body_length,
#   repeated_template_status
# The set of slots listed per class is also enforced: it must equal the set
# of slots with a non-zero raw_code_offset in the extracted data.
VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS: tuple[RepeatedFamilyRowExpectation, ...] = (
RepeatedFamilyRowExpectation("JELYHACK", 0x01, 0x002A, 0x00000001, 0x00D4, 0x00FE, 42, "referent-anchor-twin/shared-slot-0x01/same-length-template"),
RepeatedFamilyRowExpectation("JELYH2", 0x01, 0x002A, 0x00000001, 0x00D4, 0x00FE, 42, "referent-anchor-twin/shared-slot-0x01/same-length-template"),
RepeatedFamilyRowExpectation("AND_BOOT", 0x0A, 0x0253, 0x00000001, 0x00D4, 0x0327, 595, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
RepeatedFamilyRowExpectation("AND_BOOT", 0x0F, 0x0237, 0x00000254, 0x0327, 0x055E, 567, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
RepeatedFamilyRowExpectation("AND_BOOT", 0x10, 0x003B, 0x0000048B, 0x055E, 0x0599, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
RepeatedFamilyRowExpectation("BRO_BOOT", 0x0A, 0x02D5, 0x00000001, 0x00D4, 0x03A9, 725, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
RepeatedFamilyRowExpectation("BRO_BOOT", 0x0F, 0x024C, 0x000002D6, 0x03A9, 0x05F5, 588, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
RepeatedFamilyRowExpectation("BRO_BOOT", 0x10, 0x003B, 0x00000522, 0x05F5, 0x0630, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
RepeatedFamilyRowExpectation("COR_BOOT", 0x0A, 0x0227, 0x00000001, 0x00D4, 0x02FB, 551, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
RepeatedFamilyRowExpectation("COR_BOOT", 0x0F, 0x0234, 0x00000228, 0x02FB, 0x052F, 564, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
RepeatedFamilyRowExpectation("COR_BOOT", 0x10, 0x003B, 0x0000045C, 0x052F, 0x056A, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
RepeatedFamilyRowExpectation("REE_BOOT", 0x0A, 0x034B, 0x00000001, 0x00D4, 0x041F, 843, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
RepeatedFamilyRowExpectation("REE_BOOT", 0x0F, 0x025C, 0x0000034C, 0x041F, 0x067B, 604, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
RepeatedFamilyRowExpectation("REE_BOOT", 0x10, 0x003B, 0x000005A8, 0x067B, 0x06B6, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
RepeatedFamilyRowExpectation("VAR_BOOT", 0x0A, 0x029A, 0x00000001, 0x00D4, 0x036E, 666, "boot-event-core/shared-slot-0x0A/shared-slot-template"),
RepeatedFamilyRowExpectation("VAR_BOOT", 0x0F, 0x0244, 0x0000029B, 0x036E, 0x05B2, 580, "boot-event-core/shared-slot-0x0F/shared-slot-template"),
RepeatedFamilyRowExpectation("VAR_BOOT", 0x10, 0x003B, 0x000004DF, 0x05B2, 0x05ED, 59, "boot-event-core/shared-slot-0x10/same-length-template"),
RepeatedFamilyRowExpectation("SURCAMNS", 0x01, 0x0051, 0x000000D2, 0x01B7, 0x0208, 81, "callback-eventtrigger/shared-slot-0x01/shared-slot-template"),
RepeatedFamilyRowExpectation("SURCAMNS", 0x0A, 0x00D1, 0x00000001, 0x00E6, 0x01B7, 209, "callback-eventtrigger/shared-slot-0x0A/same-length-template"),
RepeatedFamilyRowExpectation("SURCAMNS", 0x20, 0x02BA, 0x00000123, 0x0208, 0x04C2, 698, "callback-eventtrigger/shared-slot-0x20/same-length-template"),
RepeatedFamilyRowExpectation("SURCAMNS", 0x21, 0x0709, 0x000003DD, 0x04C2, 0x0BCB, 1801, "callback-eventtrigger/shared-slot-0x21/shared-slot-template"),
RepeatedFamilyRowExpectation("SURCAMNS", 0x22, 0x01A3, 0x00000AE6, 0x0BCB, 0x0D6E, 419, "callback-eventtrigger/shared-slot-0x22/same-length-template"),
RepeatedFamilyRowExpectation("SURCAMEW", 0x01, 0x00F7, 0x000000D2, 0x01B7, 0x02AE, 247, "callback-eventtrigger/shared-slot-0x01/shared-slot-template"),
RepeatedFamilyRowExpectation("SURCAMEW", 0x0A, 0x00D1, 0x00000001, 0x00E6, 0x01B7, 209, "callback-eventtrigger/shared-slot-0x0A/same-length-template"),
RepeatedFamilyRowExpectation("SURCAMEW", 0x20, 0x02BA, 0x000001C9, 0x02AE, 0x0568, 698, "callback-eventtrigger/shared-slot-0x20/same-length-template"),
RepeatedFamilyRowExpectation("SURCAMEW", 0x21, 0x0655, 0x00000483, 0x0568, 0x0BBD, 1621, "callback-eventtrigger/shared-slot-0x21/shared-slot-template"),
RepeatedFamilyRowExpectation("SURCAMEW", 0x22, 0x01A3, 0x00000AD8, 0x0BBD, 0x0D60, 419, "callback-eventtrigger/shared-slot-0x22/same-length-template"),
RepeatedFamilyRowExpectation("FLAMEBOX", 0x0A, 0x026A, 0x00000001, 0x00E0, 0x034A, 618, "environmental-event/shared-slot-0x0A/shared-slot-template"),
RepeatedFamilyRowExpectation("FLAMEBOX", 0x20, 0x01AC, 0x0000026B, 0x034A, 0x04F6, 428, "environmental-event/shared-slot-0x20/shared-slot-template"),
RepeatedFamilyRowExpectation("FLAMEBOX", 0x21, 0x029A, 0x00000417, 0x04F6, 0x0790, 666, "environmental-event/shared-slot-0x21/shared-slot-template"),
RepeatedFamilyRowExpectation("NOSTRIL", 0x0A, 0x00C0, 0x00000001, 0x00E0, 0x01A0, 192, "environmental-event/shared-slot-0x0A/shared-slot-template"),
RepeatedFamilyRowExpectation("NOSTRIL", 0x20, 0x0129, 0x000000C1, 0x01A0, 0x02C9, 297, "environmental-event/shared-slot-0x20/shared-slot-template"),
RepeatedFamilyRowExpectation("NOSTRIL", 0x21, 0x01BE, 0x000001EA, 0x02C9, 0x0487, 446, "environmental-event/shared-slot-0x21/shared-slot-template"),
RepeatedFamilyRowExpectation("STEAMBOX", 0x0A, 0x0266, 0x00000001, 0x00E0, 0x0346, 614, "environmental-event/shared-slot-0x0A/shared-slot-template"),
RepeatedFamilyRowExpectation("STEAMBOX", 0x20, 0x01F6, 0x00000267, 0x0346, 0x053C, 502, "environmental-event/shared-slot-0x20/shared-slot-template"),
RepeatedFamilyRowExpectation("STEAMBOX", 0x21, 0x02A7, 0x0000045D, 0x053C, 0x07E3, 679, "environmental-event/shared-slot-0x21/shared-slot-template"),
)
def scummvm_event_name_hint(slot: int) -> str | None:
if 0 <= slot < len(SCUMMVM_EVENT_NAME_HINTS):
return SCUMMVM_EVENT_NAME_HINTS[slot]
@ -532,6 +634,368 @@ def annotate_class_layout(chunks: list[ExtractedChunk]) -> None:
chunk.class_parse_status = "parsed-class-layout"
def derive_class_event_rows(chunk: ExtractedChunk, raw_data: bytes) -> list[ClassEventRow]:
    """Expand a parsed class chunk into one ``ClassEventRow`` per event slot.

    Returns an empty list unless the chunk is marked ``parsed-class-layout``
    and carries an object index, class id and conservative event count.

    Each 6-byte table row starts at ``20 + 6 * slot`` and holds a 16-bit word
    followed by a 32-bit code offset.  For a non-zero offset the body window
    runs from ``code_base_minus_one + offset`` up to the body of the next
    larger distinct offset in the same table, or to the end of ``raw_data``
    when there is none.  Windows that fall outside ``raw_data`` are dropped
    (all three derived fields stay ``None``).
    """
    if chunk.class_parse_status != "parsed-class-layout":
        return []
    required = (chunk.object_index, chunk.class_id, chunk.conservative_event_count)
    if any(value is None for value in required):
        return []

    # First pass: decode every raw table row.
    table_entries = []
    for slot in range(chunk.conservative_event_count):
        row_offset = 20 + 6 * slot
        entry_word = read_u16_le(raw_data, row_offset)
        code_offset = read_u32_le(raw_data, row_offset + 2)
        table_entries.append((slot, entry_word, code_offset))

    # Map each distinct non-zero offset to the next larger one; the largest
    # offset has no successor and its body runs to the end of the chunk.
    distinct_offsets = sorted({code_offset for _, _, code_offset in table_entries if code_offset != 0})
    following_offset = dict(zip(distinct_offsets, distinct_offsets[1:]))

    code_base = chunk.code_base_minus_one
    chunk_size = len(raw_data)

    # Second pass: attach the derived window to every row.
    rows: list[ClassEventRow] = []
    for slot, entry_word, code_offset in table_entries:
        window_start = window_end = window_length = None
        if code_offset != 0 and code_base is not None:
            candidate_start = code_base + code_offset
            if code_offset in following_offset:
                candidate_end = code_base + following_offset[code_offset]
            else:
                candidate_end = chunk_size
            if 0 <= candidate_start <= candidate_end <= chunk_size:
                window_start = candidate_start
                window_end = candidate_end
                window_length = candidate_end - candidate_start
        rows.append(
            ClassEventRow(
                entry_index=chunk.index,
                object_index=chunk.object_index,
                class_id=chunk.class_id,
                class_name_hint=chunk.class_name_hint or "",
                slot=slot,
                event_name_hint=scummvm_event_name_hint(slot),
                raw_event_entry_word=entry_word,
                raw_code_offset=code_offset,
                derived_body_start=window_start,
                derived_body_end=window_end,
                derived_body_length=window_length,
            )
        )
    return rows
def build_class_event_rows(
    parsed_class_chunks: list[ExtractedChunk],
) -> tuple[list[ClassEventRow], dict[int, list[ClassEventRow]], dict[int, bytes]]:
    """Load every parsed class chunk from disk and derive its event rows.

    Returns a 3-tuple:
      * all rows, flattened in chunk order;
      * the rows grouped by chunk index;
      * the raw chunk bytes keyed by chunk index.

    Each chunk's bytes are read from ``chunk.raw_path``.
    """
    flattened: list[ClassEventRow] = []
    rows_by_entry: dict[int, list[ClassEventRow]] = {}
    raw_data_by_entry: dict[int, bytes] = {}

    for chunk in parsed_class_chunks:
        chunk_bytes = pathlib.Path(chunk.raw_path).read_bytes()
        chunk_rows = derive_class_event_rows(chunk, chunk_bytes)
        raw_data_by_entry[chunk.index] = chunk_bytes
        rows_by_entry[chunk.index] = chunk_rows
        flattened += chunk_rows

    return flattened, rows_by_entry, raw_data_by_entry
def build_repeated_template_status_map(
    parsed_class_chunks: list[ExtractedChunk],
    rows_by_entry: dict[int, list[ClassEventRow]],
    raw_data_by_entry: dict[int, bytes],
) -> dict[tuple[int, int], str]:
    """Tag event rows that share a slot with other members of a verified family.

    For every family in ``VERIFIED_REPEATED_TEMPLATE_FAMILIES`` with at least
    two members present, the bodies of rows sharing a slot are compared:

      * all bodies byte-identical   -> ``exact-body-clone``
      * all bodies the same length  -> ``same-length-template``
      * otherwise                   -> ``shared-slot-template``

    Rows with a zero code offset or without a derived body window are
    ignored, as are slots populated by fewer than two members.

    Returns a mapping from ``(chunk index, slot)`` to
    ``"<family>/shared-slot-0xNN/<suffix>"``.
    """
    chunk_by_label = {}
    for chunk in parsed_class_chunks:
        if chunk.primary_label:
            chunk_by_label[chunk.primary_label] = chunk

    status_by_row: dict[tuple[int, int], str] = {}
    for family_name, labels in VERIFIED_REPEATED_TEMPLATE_FAMILIES:
        members = [chunk_by_label[label] for label in labels if label in chunk_by_label]
        if len(members) < 2:
            continue

        # slot -> [(chunk index, body bytes), ...] across all family members
        bodies_by_slot: dict[int, list[tuple[int, bytes]]] = {}
        for member in members:
            member_bytes = raw_data_by_entry.get(member.index)
            if member_bytes is None:
                continue
            for row in rows_by_entry.get(member.index, []):
                has_window = row.derived_body_start is not None and row.derived_body_end is not None
                if row.raw_code_offset == 0 or not has_window:
                    continue
                body = member_bytes[row.derived_body_start:row.derived_body_end]
                bodies_by_slot.setdefault(row.slot, []).append((member.index, body))

        for slot, entries in bodies_by_slot.items():
            if len(entries) < 2:
                continue
            distinct_bodies = {body for _, body in entries}
            distinct_lengths = {len(body) for body in distinct_bodies}
            if len(distinct_bodies) == 1:
                status_suffix = "exact-body-clone"
            elif len(distinct_lengths) == 1:
                status_suffix = "same-length-template"
            else:
                status_suffix = "shared-slot-template"
            status = f"{family_name}/shared-slot-0x{slot:02X}/{status_suffix}"
            for entry_index, _ in entries:
                status_by_row[(entry_index, slot)] = status

    return status_by_row
def format_optional_hex(value: int | None, width: int = 0) -> str:
    """Render ``value`` as upper-case ``0x``-prefixed hex, or ``""`` for ``None``.

    A positive ``width`` zero-pads the digits to at least that many
    characters; any other width leaves the digits unpadded.
    """
    if value is None:
        return ""
    digit_spec = f"0{width}X" if width > 0 else "X"
    return "0x" + format(value, digit_spec)
def hex_edge(data: bytes, width: int = 8) -> str:
    """Return the first ``width`` bytes of ``data`` as lower-case hex (``""`` if empty)."""
    return data[:width].hex() if data else ""
def hex_tail(data: bytes, width: int = 8) -> str:
    """Return the last ``width`` bytes of ``data`` as lower-case hex.

    Yields ``""`` for empty data and for a non-positive ``width``.  The width
    guard matters because ``data[-0:]`` is the *whole* buffer and a negative
    width would slice from the front, neither of which is a "tail".
    """
    if not data or width <= 0:
        return ""
    return data[-width:].hex()
def write_family_decompile_artifact(
    out_dir: pathlib.Path,
    parsed_class_chunks: list[ExtractedChunk],
    rows_by_entry: dict[int, list[ClassEventRow]],
    raw_data_by_entry: dict[int, bytes],
    repeated_status_by_row: dict[tuple[int, int], str],
    spec: FamilyArtifactSpec,
) -> None:
    """Write ``<output_stem>.md`` and ``<output_stem>.tsv`` for one family.

    Family members are the parsed chunks whose primary label appears in
    ``spec.labels``, processed in label order.  Only rows with a non-zero
    code offset are rendered, and chunks without such rows are skipped.
    Nothing is written when no family member is present at all.
    """
    wanted_labels = set(spec.labels)
    members = sorted(
        (chunk for chunk in parsed_class_chunks if chunk.primary_label in wanted_labels),
        key=lambda chunk: chunk.primary_label or "",
    )
    if not members:
        return

    tsv_lines = [
        "entry_index\tclass_id\tclass_name\tslot\tevent_name_hint\traw_event_entry_word\traw_code_offset\tderived_body_start\tderived_body_end\tderived_body_length\trepeated_template_status\tbody_sha1\tbody_prefix_hex\tbody_suffix_hex"
    ]
    md_lines = [
        f"# {spec.title}",
        "",
        "This is a reversible per-class rendering derived directly from `class_event_index.tsv` plus the raw extracted chunk bytes.",
        "ScummVM event labels remain hints only; the authoritative data here is the slot id, raw row bytes, and derived body window.",
        "",
    ]

    for chunk in members:
        event_rows = [row for row in rows_by_entry.get(chunk.index, []) if row.raw_code_offset != 0]
        if not event_rows:
            continue
        chunk_bytes = raw_data_by_entry[chunk.index]

        # Per-class header of the fenced YAML block.
        md_lines += [
            f"## {chunk.primary_label}",
            "",
            "```yaml",
            "class:",
            f" entry_index: 0x{chunk.index:03X}",
            f" class_id: 0x{chunk.class_id:X}",
            f" class_name: {chunk.primary_label}",
            f" class_object_index: 0x{chunk.object_index:X}",
            f" raw_code_base_u32: 0x{chunk.raw_code_base_u32:X}",
            f" code_base_minus_one: 0x{chunk.code_base_minus_one:X}",
            f" conservative_event_count: {chunk.conservative_event_count}",
            " events:",
        ]

        for row in event_rows:
            if row.derived_body_start is None or row.derived_body_end is None:
                body = b""
            else:
                body = chunk_bytes[row.derived_body_start:row.derived_body_end]

            # Values shared by the markdown and TSV renderings; empty strings
            # stay empty in the TSV and are shown as "null" in the markdown.
            repeated_status = repeated_status_by_row.get((row.entry_index, row.slot), "")
            body_sha1 = hashlib.sha1(body).hexdigest() if body else ""
            start_hex = format_optional_hex(row.derived_body_start, 4)
            end_hex = format_optional_hex(row.derived_body_end, 4)
            length_text = "" if row.derived_body_length is None else str(row.derived_body_length)
            prefix_hex = hex_edge(body)
            suffix_hex = hex_tail(body)

            md_lines += [
                f" - slot: 0x{row.slot:02x}",
                f" event_name_hint: {row.event_name_hint or ''}",
                f" raw_event_entry_word: 0x{row.raw_event_entry_word:04x}",
                f" raw_code_offset: 0x{row.raw_code_offset:08x}",
                f" derived_body_start: {start_hex.lower() or 'null'}",
                f" derived_body_end: {end_hex.lower() or 'null'}",
                f" derived_body_length: {length_text or 'null'}",
                f" repeated_template_status: {repeated_status or 'unique-or-unclassified'}",
                f" body_sha1: {body_sha1 or 'null'}",
                f" body_prefix_hex: {prefix_hex or 'null'}",
                f" body_suffix_hex: {suffix_hex or 'null'}",
            ]
            tsv_lines.append(
                "\t".join(
                    [
                        str(row.entry_index),
                        f"0x{row.class_id:X}",
                        chunk.primary_label or "",
                        f"0x{row.slot:02X}",
                        row.event_name_hint or "",
                        f"0x{row.raw_event_entry_word:04X}",
                        f"0x{row.raw_code_offset:08X}",
                        start_hex,
                        end_hex,
                        length_text,
                        repeated_status,
                        body_sha1,
                        prefix_hex,
                        suffix_hex,
                    ]
                )
            )

        md_lines += ["```", ""]

    (out_dir / f"{spec.output_stem}.md").write_text("\n".join(md_lines), encoding="utf-8")
    (out_dir / f"{spec.output_stem}.tsv").write_text("\n".join(tsv_lines) + "\n", encoding="utf-8")
def validate_verified_repeated_family_regressions(
    parsed_class_chunks: list[ExtractedChunk],
    rows_by_entry: dict[int, list[ClassEventRow]],
    repeated_status_by_row: dict[tuple[int, int], str],
) -> list[str]:
    """Check extracted rows against ``VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS``.

    Two kinds of records are produced:

      * ``slot-set``: per class, the set of slots with a non-zero code offset
        must equal the set of slots listed in the expectation table;
      * ``row``: per expectation, the raw row fields, derived body window and
        repeated-template status must all match.

    Returns the TSV report lines (header included) when everything matches.

    Raises:
        ValueError: if any record mismatches; the message lists every
            mismatch found.
    """
    chunk_by_label = {}
    for chunk in parsed_class_chunks:
        if chunk.primary_label:
            chunk_by_label[chunk.primary_label] = chunk

    expected_slots_by_class: dict[str, set[int]] = {}
    for expectation in VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS:
        expected_slots_by_class.setdefault(expectation.class_name, set()).add(expectation.slot)

    report_lines = ["record_type\tclass_name\tslot\texpected\tactual\tstatus"]
    errors: list[str] = []

    # Pass 1: the populated slot set of every expected class.
    for class_name in sorted(expected_slots_by_class):
        expected_slots = expected_slots_by_class[class_name]
        chunk = chunk_by_label.get(class_name)
        class_rows = [] if chunk is None else rows_by_entry.get(chunk.index, [])
        actual_slots = {row.slot for row in class_rows if row.raw_code_offset != 0}

        status = "ok" if actual_slots == expected_slots else "mismatch"
        expected_text = ",".join(f"0x{slot:02X}" for slot in sorted(expected_slots))
        actual_text = ",".join(f"0x{slot:02X}" for slot in sorted(actual_slots))
        report_lines.append(f"slot-set\t{class_name}\t*\t{expected_text}\t{actual_text}\t{status}")
        if status != "ok":
            errors.append(
                f"{class_name}: expected non-zero slots {sorted(expected_slots)}, found {sorted(actual_slots)}"
            )

    # Pass 2: field-by-field comparison of every expected row.
    for expectation in VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS:
        record_prefix = f"row\t{expectation.class_name}\t0x{expectation.slot:02X}"

        chunk = chunk_by_label.get(expectation.class_name)
        if chunk is None:
            errors.append(f"missing repeated-family class {expectation.class_name}")
            report_lines.append(f"{record_prefix}\tpresent\tmissing-class\tmismatch")
            continue

        # The first row in the expected slot wins, whatever its code offset.
        row = None
        for candidate in rows_by_entry.get(chunk.index, []):
            if candidate.slot == expectation.slot:
                row = candidate
                break
        if row is None:
            errors.append(f"missing row {expectation.class_name} slot 0x{expectation.slot:02X}")
            report_lines.append(f"{record_prefix}\tpresent\tmissing-row\tmismatch")
            continue

        actual_status = repeated_status_by_row.get((row.entry_index, row.slot), "")
        actual_values = (
            row.raw_event_entry_word,
            row.raw_code_offset,
            row.derived_body_start,
            row.derived_body_end,
            row.derived_body_length,
            actual_status,
        )
        expected_values = (
            expectation.raw_event_entry_word,
            expectation.raw_code_offset,
            expectation.derived_body_start,
            expectation.derived_body_end,
            expectation.derived_body_length,
            expectation.repeated_template_status,
        )
        status = "ok" if actual_values == expected_values else "mismatch"

        expected_text = "|".join(
            [
                f"0x{expectation.raw_event_entry_word:04X}",
                f"0x{expectation.raw_code_offset:08X}",
                f"0x{expectation.derived_body_start:04X}",
                f"0x{expectation.derived_body_end:04X}",
                str(expectation.derived_body_length),
                expectation.repeated_template_status,
            ]
        )
        actual_text = "|".join(
            [
                f"0x{row.raw_event_entry_word:04X}",
                f"0x{row.raw_code_offset:08X}",
                format_optional_hex(row.derived_body_start, 4),
                format_optional_hex(row.derived_body_end, 4),
                str(row.derived_body_length if row.derived_body_length is not None else ""),
                actual_status,
            ]
        )
        report_lines.append(f"{record_prefix}\t{expected_text}\t{actual_text}\t{status}")
        if status != "ok":
            errors.append(
                f"{expectation.class_name} slot 0x{expectation.slot:02X}: "
                f"expected {expected_values}, found {actual_values}"
            )

    if errors:
        raise ValueError(
            "repeated-family regression mismatch:\n- " + "\n- ".join(errors)
        )
    return report_lines
def readable_neighbor_chunks(
center: ExtractedChunk,
chunk_by_index: dict[int, ExtractedChunk],
@ -1556,6 +2020,17 @@ def write_summary(out_dir: pathlib.Path, input_path: pathlib.Path, data: bytes,
"entry_index\tobject_index\tclass_id\tclass_name_hint\traw_code_base_u32\tcode_base_minus_one\tconservative_event_count\tevent_table_end\tclass_parse_status\tdata_offset\tdeclared_size\tprimary_label"
]
parsed_class_chunks = [chunk for chunk in chunks if chunk.class_parse_status == "parsed-class-layout"]
class_event_rows, rows_by_entry, raw_data_by_entry = build_class_event_rows(parsed_class_chunks)
repeated_status_by_row = build_repeated_template_status_map(
parsed_class_chunks,
rows_by_entry,
raw_data_by_entry,
)
repeated_family_regression_lines = validate_verified_repeated_family_regressions(
parsed_class_chunks,
rows_by_entry,
repeated_status_by_row,
)
for chunk in parsed_class_chunks:
class_layout_lines.append(
"{index}\t0x{object_index:X}\t0x{class_id:X}\t{class_name_hint}\t0x{raw_code_base_u32:X}\t0x{code_base_minus_one:X}\t{conservative_event_count}\t0x{event_table_end:X}\t{class_parse_status}\t0x{data_offset:X}\t0x{declared_size:X}\t{primary_label}".format(
@ -1576,28 +2051,39 @@ def write_summary(out_dir: pathlib.Path, input_path: pathlib.Path, data: bytes,
(out_dir / "class_layout_index.tsv").write_text("\n".join(class_layout_lines) + "\n", encoding="utf-8")
class_event_lines = [
"entry_index\tobject_index\tclass_id\tclass_name_hint\tslot\tevent_name_hint\traw_event_entry_word\traw_code_offset"
"entry_index\tobject_index\tclass_id\tclass_name_hint\tslot\tevent_name_hint\traw_event_entry_word\traw_code_offset\tderived_body_start\tderived_body_end\tderived_body_length\trepeated_template_status"
]
for chunk in parsed_class_chunks:
raw_data = pathlib.Path(chunk.raw_path).read_bytes()
assert chunk.conservative_event_count is not None
for slot in range(chunk.conservative_event_count):
entry_offset = 20 + 6 * slot
raw_word = read_u16_le(raw_data, entry_offset)
raw_code_offset = read_u32_le(raw_data, entry_offset + 2)
class_event_lines.append(
"{entry_index}\t0x{object_index:X}\t0x{class_id:X}\t{class_name_hint}\t0x{slot:02X}\t{event_name_hint}\t0x{raw_word:04X}\t0x{raw_code_offset:08X}".format(
entry_index=chunk.index,
object_index=chunk.object_index,
class_id=chunk.class_id,
class_name_hint=chunk.class_name_hint or "",
slot=slot,
event_name_hint=scummvm_event_name_hint(slot) or "",
raw_word=raw_word,
raw_code_offset=raw_code_offset,
)
for row in class_event_rows:
class_event_lines.append(
"{entry_index}\t0x{object_index:X}\t0x{class_id:X}\t{class_name_hint}\t0x{slot:02X}\t{event_name_hint}\t0x{raw_event_entry_word:04X}\t0x{raw_code_offset:08X}\t{derived_body_start}\t{derived_body_end}\t{derived_body_length}\t{repeated_template_status}".format(
entry_index=row.entry_index,
object_index=row.object_index,
class_id=row.class_id,
class_name_hint=row.class_name_hint,
slot=row.slot,
event_name_hint=row.event_name_hint or "",
raw_event_entry_word=row.raw_event_entry_word,
raw_code_offset=row.raw_code_offset,
derived_body_start=format_optional_hex(row.derived_body_start, 4),
derived_body_end=format_optional_hex(row.derived_body_end, 4),
derived_body_length=(row.derived_body_length if row.derived_body_length is not None else ""),
repeated_template_status=repeated_status_by_row.get((row.entry_index, row.slot), ""),
)
)
(out_dir / "class_event_index.tsv").write_text("\n".join(class_event_lines) + "\n", encoding="utf-8")
for family_artifact_spec in FAMILY_ARTIFACT_SPECS:
write_family_decompile_artifact(
out_dir,
parsed_class_chunks,
rows_by_entry,
raw_data_by_entry,
repeated_status_by_row,
family_artifact_spec,
)
(out_dir / "repeated_family_regressions.tsv").write_text(
"\n".join(repeated_family_regression_lines) + "\n",
encoding="utf-8",
)
neighborhood_lines = [
"center_index\tneighbor_index\tprimary_label\tfield_names\tfield_tags"
@ -1763,7 +2249,9 @@ def write_summary(out_dir: pathlib.Path, input_path: pathlib.Path, data: bytes,
lines.append("- `.strings.txt` files are the main human-readable output for now; `.txt` files are emitted only for chunks that look text-like.")
lines.append("- `descriptor_index.tsv` summarizes guessed class labels, field names, and compact tag patterns for descriptor-like chunks.")
lines.append("- `class_layout_index.tsv` records the conservative owner-loaded class parsing state: object index, class id, class-name hint, raw bytes-8..11 field, derived code-base-minus-one, and event-count/table-end values when the local divisibility and bounds checks succeed.")
lines.append("- `class_event_index.tsv` expands parsed owner-loaded classes into raw 6-byte event rows with slot numbers, ScummVM event-name hints for `0x00..0x1f`, unresolved leading words, and raw code-offset dwords for round-trip tooling work.")
lines.append("- `class_event_index.tsv` now also emits derived body-window columns (`derived_body_start`, `derived_body_end`, `derived_body_length`) plus conservative `repeated_template_status` tags for verified repeated families.")
lines.append("- `boot_family_decompile.md` / `.tsv`, `callback_family_decompile.md` / `.tsv`, and `environmental_family_decompile.md` / `.tsv` now provide reversible per-class decompile artifacts for the `_BOOT`, `SURCAM*`, and environmental repeated-family lanes.")
lines.append("- `repeated_family_regressions.tsv` enforces the current repeated-family slot sets plus the verified raw-row and derived body-window fields for `JELYHACK/JELYH2`, `_BOOT`, `SURCAM*`, and `FLAMEBOX/NOSTRIL/STEAMBOX`.")
lines.append("- `descriptor_neighborhoods.tsv` captures local table neighborhoods around trigger/event-related classes such as `JELYHACK`, `NPCTRIG`, `CRUZTRIG`, `TRIGPAD`, and `SPECIAL`.")
lines.append("- `referent_anchor_event_graph.tsv` groups referent-bearing descriptors with nearby event-bearing neighbors so the attachment model can be inspected without ad hoc grepping.")
lines.append("- `jelyhack_island_graph.md` now uses a wider local window so the `JELYHACK` / `JELYH2` anchors can be inspected alongside the nearby event-bearing `REE_BOOT`, `SURCAMEW`, and `SFXTRIG` descriptors rather than stopping at the referent-only neighbors.")

View file

@ -0,0 +1,165 @@
import csv
import glob
import hashlib
import json
from pathlib import Path
# Directory one level above the one containing this script.
ROOT = Path(__file__).resolve().parents[1]
# Inputs read by this tool: two TSV indexes and the directory of raw chunk
# files they refer to.
LAYOUT_PATH = ROOT / "USECODE" / "EUSECODE_extracted" / "class_layout_index.tsv"
EVENT_PATH = ROOT / "USECODE" / "EUSECODE_extracted" / "class_event_index.tsv"
CHUNKS_DIR = ROOT / "USECODE" / "EUSECODE_extracted" / "chunks"
# Families to compare: for each family, the class names whose event bodies
# are compared and the event slots that are compared across those classes.
FAMILIES = {
"BOOT": {
"classes": ["AND_BOOT", "BRO_BOOT", "COR_BOOT", "REE_BOOT", "VAR_BOOT"],
"slots": [0x0A, 0x0F, 0x10],
},
"SURCAM": {
"classes": ["SURCAMNS", "SURCAMEW"],
"slots": [0x20, 0x21, 0x22],
},
"JELY": {
"classes": ["JELYHACK", "JELYH2"],
"slots": [0x01],
},
}
def parse_hex(value: str) -> int:
    """Parse a base-16 integer string; a leading ``0x`` prefix is accepted."""
    return int(value, base=16)
def common_prefix_length(blobs: list[bytes]) -> int:
    """Return how many leading bytes are identical across every blob.

    An empty list yields 0; a single blob yields its own length.
    """
    if not blobs:
        return 0
    shared = 0
    # zip() stops at the shortest blob, which bounds the prefix length.
    for column in zip(*blobs):
        if len(set(column)) != 1:
            break
        shared += 1
    return shared
def common_suffix_length(blobs: list[bytes]) -> int:
    """Return how many trailing bytes are identical across every blob.

    An empty list yields 0; a single blob yields its own length.
    """
    if not blobs:
        return 0
    shared = 0
    # Walk all blobs back-to-front in lockstep; zip() stops at the shortest.
    for column in zip(*(reversed(blob) for blob in blobs)):
        if len(set(column)) != 1:
            break
        shared += 1
    return shared
def first_diff_positions(blobs: list[bytes], limit: int = 8) -> list[int]:
    """Return up to ``limit`` byte positions at which the blobs disagree.

    Positions are scanned in ascending order up to the longest blob; a blob
    that is too short to have a byte at a position counts as differing from
    any blob that does.  An empty ``blobs`` list or a non-positive ``limit``
    yields an empty list.
    """
    positions: list[int] = []
    if limit <= 0:
        return positions
    # default=0 keeps an empty blob list from raising inside max().
    longest = max((len(blob) for blob in blobs), default=0)
    for index in range(longest):
        values = {blob[index] if index < len(blob) else None for blob in blobs}
        if len(values) > 1:
            positions.append(index)
            if len(positions) >= limit:
                break
    return positions
def load_layouts(targets: set[str]) -> dict[str, dict[str, str]]:
    """Read ``LAYOUT_PATH`` and return its rows for the requested classes.

    The result maps ``class_name_hint`` to the full TSV row; when a class
    name appears more than once, the last row wins.
    """
    with LAYOUT_PATH.open("r", encoding="utf-8", newline="") as handle:
        return {
            record["class_name_hint"]: record
            for record in csv.DictReader(handle, delimiter="\t")
            if record["class_name_hint"] in targets
        }
def load_events(targets: set[str]) -> dict[str, list[dict[str, str]]]:
    """Read ``EVENT_PATH`` and group its rows by class for the requested classes.

    Each class's rows are returned sorted by numeric slot.  Classes without
    any row in the file are absent from the result.
    """

    def slot_number(record: dict[str, str]) -> int:
        return parse_hex(record["slot"])

    grouped: dict[str, list[dict[str, str]]] = {}
    with EVENT_PATH.open("r", encoding="utf-8", newline="") as handle:
        for record in csv.DictReader(handle, delimiter="\t"):
            class_name = record["class_name_hint"]
            if class_name not in targets:
                continue
            grouped.setdefault(class_name, []).append(record)

    for records in grouped.values():
        records.sort(key=slot_number)
    return grouped
def resolve_chunk(data_offset: int) -> Path:
    """Locate the single extracted chunk file that starts at ``data_offset``.

    Chunk files are matched by name inside ``CHUNKS_DIR`` using the pattern
    ``chunk_*_off_<OFFSET>_len_*.bin`` with the offset as six upper-case hex
    digits.

    Raises:
        RuntimeError: if zero or more than one file matches.
    """
    # Globbing relative to the directory keeps wildcard characters that may
    # occur in the directory path itself from being treated as a pattern;
    # sorting makes the error message deterministic.
    pattern = f"chunk_*_off_{data_offset:06X}_len_*.bin"
    matches = sorted(CHUNKS_DIR.glob(pattern))
    if len(matches) != 1:
        found = [str(match) for match in matches]
        raise RuntimeError(f"chunk lookup failed for 0x{data_offset:06X}: {found}")
    return matches[0]
def build_rows() -> dict[tuple[str, int], dict[str, object]]:
    """Build one record per (class name, slot) with a non-zero code offset.

    For every class named in ``FAMILIES`` that has a layout row, the chunk
    file is loaded and each event body is sliced out of it.  A body starts at
    ``code_base_minus_one + raw_code_offset`` and ends where the body of the
    next larger distinct offset starts, or at the end of the chunk.

    Each record carries the class name, slot, the raw TSV strings for the
    event-name hint / entry word / code offset, the body window (``start``,
    ``end``, ``length``), its SHA-1, a 16-byte hex preview, the chunk path
    with forward slashes, and the body bytes themselves.
    """
    targets = {name for family in FAMILIES.values() for name in family["classes"]}
    layouts = load_layouts(targets)
    events = load_events(targets)

    rows_by_key: dict[tuple[str, int], dict[str, object]] = {}
    for class_name, layout in layouts.items():
        chunk_path = resolve_chunk(parse_hex(layout["data_offset"]))
        blob = chunk_path.read_bytes()
        code_base_minus_one = parse_hex(layout["code_base_minus_one"])

        # load_events() only has keys for classes with at least one event
        # row, so a class with a layout but no events must not raise here.
        nonzero_rows = [
            row
            for row in events.get(class_name, [])
            if parse_hex(row["raw_code_offset"]) != 0
        ]
        offsets = sorted({parse_hex(row["raw_code_offset"]) for row in nonzero_rows})
        # Successor of each distinct offset; the largest one has none.
        next_offset = dict(zip(offsets, offsets[1:]))

        for row in nonzero_rows:
            slot = parse_hex(row["slot"])
            code_offset = parse_hex(row["raw_code_offset"])
            start = code_base_minus_one + code_offset
            if code_offset in next_offset:
                end = code_base_minus_one + next_offset[code_offset]
            else:
                end = len(blob)
            body = blob[start:end]
            rows_by_key[(class_name, slot)] = {
                "class_name": class_name,
                "slot": slot,
                "event_name_hint": row["event_name_hint"],
                "raw_event_entry_word": row["raw_event_entry_word"],
                "raw_code_offset": row["raw_code_offset"],
                "start": start,
                "end": end,
                "length": len(body),
                "sha1": hashlib.sha1(body).hexdigest(),
                "preview": body[:16].hex(" "),
                "chunk_path": str(chunk_path).replace("\\", "/"),
                "body": body,
            }
    return rows_by_key
def main() -> None:
    """Print a per-family, per-slot comparison of event bodies to stdout.

    For every slot the report lists one tab-separated line per class,
    followed by the groups of classes with byte-identical bodies, the common
    prefix/suffix lengths, and the first differing byte positions.

    Classes that have no row for a slot are reported on a
    ``missing_classes=`` line and left out of the comparison instead of
    aborting the whole report.
    """
    rows_by_key = build_rows()
    for family_name, family in FAMILIES.items():
        print(f"## {family_name}")
        for slot in family["slots"]:
            print(f"SLOT 0x{slot:02X}")

            subset = []
            missing = []
            for class_name in family["classes"]:
                row = rows_by_key.get((class_name, slot))
                if row is None:
                    missing.append(class_name)
                else:
                    subset.append(row)
            if missing:
                print("missing_classes=" + json.dumps(missing))

            for row in subset:
                print(
                    "\t".join(
                        [
                            str(row["class_name"]),
                            str(row["event_name_hint"]),
                            str(row["raw_event_entry_word"]),
                            str(row["raw_code_offset"]),
                            f"{row['start']:04X}-{row['end']:04X}",
                            str(row["length"]),
                            str(row["sha1"])[:12],
                            str(row["preview"]),
                            str(row["chunk_path"]),
                        ]
                    )
                )

            # Classes sharing a SHA-1 have byte-identical bodies.
            groups: dict[str, list[str]] = {}
            for row in subset:
                groups.setdefault(str(row["sha1"]), []).append(str(row["class_name"]))
            blobs = [row["body"] for row in subset]

            print("identical_groups=" + json.dumps(list(groups.values())))
            print(
                f"common_prefix_len={common_prefix_length(blobs)} "
                f"common_suffix_len={common_suffix_length(blobs)}"
            )
            # Guard the empty case here so this report does not depend on how
            # first_diff_positions() treats an empty blob list.
            diff_positions = first_diff_positions(blobs) if blobs else []
            print("first_diff_positions=" + json.dumps(diff_positions))
            print()
# Run the comparison report only when executed as a script, not on import.
if __name__ == "__main__":
    main()