#!/usr/bin/env python3 """Extractor for Crusader's EUSECODE.FLX container. Current validated layout: - 0x80-byte header area - little-endian entry count at file offset 0x54 - entry table begins at 0x80 - each entry is 8 bytes: The exact semantics of the payload records are still under RE, so the extractor dumps all non-zero entries and emits human-readable sidecars (.strings.txt and index files) to support the next decoding pass. """ from __future__ import annotations import argparse import hashlib import json import pathlib import struct from dataclasses import asdict, dataclass DEFAULT_INPUT = pathlib.Path(r"k:\ghidra\Crusader_Decomp\USECODE\EUSECODE.FLX") DEFAULT_OUTPUT = pathlib.Path(r"k:\ghidra\Crusader_Decomp\USECODE\EUSECODE_extracted") @dataclass(frozen=True) class CandidateEntry: table_offset: int data_offset: int declared_size: int @dataclass class ExtractedChunk: index: int table_offset: int object_index: int | None data_offset: int declared_size: int next_offset: int | None extracted_size: int overlap_with_next: bool text_like: bool printable_ratio: float zero_ratio: float preview: str raw_path: str strings_path: str text_path: str | None primary_label: str | None field_names: list[str] field_tags: list[str] class_id: int | None = None class_name_hint: str | None = None raw_code_base_u32: int | None = None code_base_minus_one: int | None = None conservative_event_count: int | None = None event_table_end: int | None = None class_parse_status: str | None = None @dataclass(frozen=True) class ClassEventRow: entry_index: int object_index: int class_id: int class_name_hint: str slot: int event_name_hint: str | None raw_event_entry_word: int raw_code_offset: int derived_body_start: int | None derived_body_end: int | None derived_body_length: int | None @dataclass(frozen=True) class FlxTable: entry_count: int table_offset: int table_end: int entries: list[CandidateEntry] @dataclass(frozen=True) class FamilyArtifactSpec: output_stem: str title: str labels: tuple[str, ...] @dataclass(frozen=True) class RepeatedFamilyRowExpectation: class_name: str slot: int raw_event_entry_word: int raw_code_offset: int derived_body_start: int derived_body_end: int derived_body_length: int repeated_template_status: str def read_u32_le(data: bytes, offset: int) -> int: return struct.unpack_from(" int: return struct.unpack_from(" str: preview = [] for byte in data[:limit]: if 0x20 <= byte <= 0x7E: preview.append(chr(byte)) else: preview.append(".") return "".join(preview) def printable_ratio(data: bytes) -> float: if not data: return 0.0 printable = sum(1 for byte in data if byte in (0x09, 0x0A, 0x0D) or 0x20 <= byte <= 0x7E) return printable / len(data) def zero_ratio(data: bytes) -> float: if not data: return 0.0 return data.count(0) / len(data) def iter_printable_runs(data: bytes, min_len: int = 4) -> list[str]: runs: list[str] = [] current = bytearray() for byte in data: if byte in (0x09, 0x0A, 0x0D) or 0x20 <= byte <= 0x7E: current.append(byte) continue if len(current) >= min_len: runs.append(current.decode("latin-1")) current.clear() if len(current) >= min_len: runs.append(current.decode("latin-1")) return runs def summarize_descriptor(strings: list[str]) -> tuple[str | None, list[str]]: label_counts: dict[str, int] = {} field_names: list[str] = [] seen_fields: set[str] = set() for value in strings: if value.isupper() and any(ch.isalpha() for ch in value): label_counts[value] = label_counts.get(value, 0) + 1 continue if value and value[0].islower() and value.replace("_", "").isalnum() and value not in seen_fields: seen_fields.add(value) field_names.append(value) primary_label = None if label_counts: primary_label = sorted(label_counts.items(), key=lambda item: (-item[1], item[0]))[0][0] return primary_label, field_names def extract_field_tag_records(data: bytes, field_names: list[str]) -> list[str]: tags: list[str] = [] seen: set[str] = set() for field_name in field_names: needle = field_name.encode("latin-1") start = 0 while True: pos = data.find(needle, start) if pos < 3: break tag = f"{data[pos - 3]:02X}:{data[pos - 2]:02X}{data[pos - 1]:02X}->{field_name}" if tag not in seen: seen.add(tag) tags.append(tag) start = pos + 1 tags.sort() return tags def has_referent_field(chunk: ExtractedChunk) -> bool: if "referent" in chunk.field_names: return True return any(tag.endswith("->referent") for tag in chunk.field_tags) def get_event_evidence(chunk: ExtractedChunk) -> list[str]: evidence: list[str] = [] seen: set[str] = set() for field_name in chunk.field_names: if "event" not in field_name.lower(): continue marker = f"field:{field_name}" if marker not in seen: seen.add(marker) evidence.append(marker) for field_tag in chunk.field_tags: if "->event" not in field_tag.lower(): continue marker = f"tag:{field_tag}" if marker not in seen: seen.add(marker) evidence.append(marker) return evidence def chunk_role(chunk: ExtractedChunk) -> str: if chunk.primary_label in {"JELYHACK", "JELYH2"}: return "referent-anchor" if get_event_evidence(chunk): return "event-bearing" if has_referent_field(chunk): return "referent-neighbor" return "neighbor" def has_event_trigger_field(chunk: ExtractedChunk) -> bool: if any("eventtrigger" == field_name.lower() for field_name in chunk.field_names): return True return any("->eventtrigger" in field_tag.lower() for field_tag in chunk.field_tags) def event_tag_kind(chunk: ExtractedChunk) -> str: if any("->eventtrigger" in field_tag.lower() for field_tag in chunk.field_tags): return "eventTrigger" if any(field_tag.lower().endswith("->event") for field_tag in chunk.field_tags): return "event" return "" def classify_event_family(chunk: ExtractedChunk) -> str: if event_tag_kind(chunk) == "eventTrigger": return "callback-eventtrigger" if event_tag_kind(chunk) != "event": return "" if chunk.primary_label == "EVENT": return "event-hub" if chunk.primary_label and chunk.primary_label.endswith("_BOOT"): return "boot-event-core" if chunk.field_names == ["referent", "event"]: return "minimal-event-core" if any(name in chunk.field_names for name in ("flame", "flame2", "fire", "fire2", "steam", "steam2")): return "environmental-event" if "typeNpc" in chunk.field_names: return "npc-trigger" return "specialized-event" def readable_role(chunk: ExtractedChunk) -> str: family = classify_event_family(chunk) if family == "event-hub": return "hub" if chunk_role(chunk) == "referent-anchor": return "anchor" if event_tag_kind(chunk) == "event": return "attach" if event_tag_kind(chunk) == "eventTrigger": return "callback" if has_referent_field(chunk): return "neighbor" return "data" def clean_token(value: str) -> str: return " ".join(value.split()) def readable_subject(chunk: ExtractedChunk) -> str: label = clean_token(chunk.primary_label or f"chunk_{chunk.index}") fields = ",".join(clean_token(field_name) for field_name in chunk.field_names) or "?" return f"{label}({fields})" def readable_signature(chunk: ExtractedChunk) -> str: return f"{readable_role(chunk)} {readable_subject(chunk)}" VERIFIED_VM_IR_ROWS: tuple[dict[str, str], ...] = ( { "stage_address": "000d:0988", "ir_name": "APPEND_UNIQUE_INLINE", "opcode_or_lane": "opcode 0x18 (implied sibling)", "payload_shape": "inline referent-chain payload", "evidence": "0x19/0x1a/0x1b compares in 000d:0988 imply 0x18 as append-unique inline sibling", }, { "stage_address": "000d:0988", "ir_name": "APPEND_UNIQUE_INDIRECT", "opcode_or_lane": "opcode 0x19", "payload_shape": "indirect/string-like referent-chain payload", "evidence": "[BP-0x32] == 0x19 path in 000d:0988 with indirect-mode flag", }, { "stage_address": "000d:0988", "ir_name": "REMOVE_MATCHING_INDIRECT", "opcode_or_lane": "opcode 0x1a", "payload_shape": "indirect/string-like referent-chain payload", "evidence": "[BP-0x32] == 0x1a path in 000d:0988 reaches entity_vm_referent_chain_remove_matching_from", }, { "stage_address": "000d:0988", "ir_name": "REMOVE_MATCHING_INLINE", "opcode_or_lane": "opcode 0x1b", "payload_shape": "inline referent-chain payload", "evidence": "[BP-0x32] == 0x1b path in 000d:0988 reaches entity_vm_referent_chain_remove_matching_from without indirect-mode flag", }, { "stage_address": "000d:177c", "ir_name": "PUSH_FRAME_WORD_LITERAL", "opcode_or_lane": "same FUN_000d_ebe3 sequencer family", "payload_shape": "word scalar pushed to stream stack", "evidence": "000d:177c subtracts 2 from [context+0xcc] and stores frame-local word before entity_vm_opcode_finish", }, { "stage_address": "000d:1acb", "ir_name": "COMPARE_STREAM_DWORD_AND_PUSH_BOOL", "opcode_or_lane": "same FUN_000d_ebe3 sequencer family", "payload_shape": "stream dword pair consumed, predicate word emitted", "evidence": "000d:1acb reads one 32-bit pair from stream, compares against AX:DX, pushes boolean word result", }, { "stage_address": "000d:208b", "ir_name": "MATERIALIZE_OR_FORWARD_VALUE", "opcode_or_lane": "slot-backed context consumer", "payload_shape": "materialized slot value or forwarded object result", "evidence": "000d:208b builds one VM context then forwards immediate or object-backed value through shared epilogue", }, { "stage_address": "000d:21ed", "ir_name": "PREPEND_INLINE_PAYLOAD", "opcode_or_lane": "inline payload substage", "payload_shape": "caller-owned blob copied into context +0x102 buffer", "evidence": "000d:21ed prepends caller bytes into backward-growing context buffer before metadata-driven follow-on work", }, { "stage_address": "000d:22bc", "ir_name": "BUILD_ENTITY_LINK_MATRIX", "opcode_or_lane": "inline payload follow-on stage", "payload_shape": "two signed metadata bytes plus streamed entity/link ids", "evidence": "000d:22bc consumes two signed bytes from +0xd6/+0xd8 and streamed words for repeated entity_link calls", }, { "stage_address": "000d:22bc", "ir_name": "EMIT_OR_PUSHBACK_RESULT", "opcode_or_lane": "inline payload follow-on stage", "payload_shape": "stream writeback filter", "evidence": "000d:23da..2421 pushes back only results without 0x0400 list flag before opcode finish", }, { "stage_address": "000d:2104", "ir_name": "FINALIZE_MIXED_VALUE_TO_OUTPTR", "opcode_or_lane": "same FUN_000d_ebe3 sequencer family", "payload_shape": "mixed immediate/object scalar return", "evidence": "000d:2104 writes either frame-local dword or object word with high word cleared to caller out-ptr", }, ) VERIFIED_MASK_LADDER_ROWS: tuple[dict[str, str], ...] = ( { "wrapper_address": "0005:27a4", "mask_pair": "0x0001:0000", "caller_anchor": "000c:a09e entity +0x5b bit 0x0004 branch", "descriptor_bias": "generic active-event-biased lane; no direct class-id bridge", }, { "wrapper_address": "0005:2867", "mask_pair": "0x0002:0001", "caller_anchor": "stores result into entity field +0x39", "descriptor_bias": "active-event ecosystem bias stronger than referent-anchor bias", }, { "wrapper_address": "0005:2ae2", "mask_pair": "0x0004:0002", "caller_anchor": "same verified local mask ladder around entity_vm_context_try_create_masked_for_entity", "descriptor_bias": "active-event ecosystem bias stronger than referent-anchor bias", }, { "wrapper_address": "0005:2c9b", "mask_pair": "0x0010:0004", "caller_anchor": "direct callers at 0005:5946 and 0005:59e9", "descriptor_bias": "active-event ecosystem bias stronger than referent-anchor bias", }, { "wrapper_address": "0005:2918", "mask_pair": "0x0020:0005", "caller_anchor": "+0x3c == 0x20b object lane, carries caller fields +0x36/+0x38", "descriptor_bias": "candidate active-event field bridge; still not descriptor-specific", }, { "wrapper_address": "0005:2c06", "mask_pair": "0x0200:0009", "caller_anchor": "direct caller anchor at 0005:0292", "descriptor_bias": "active-event ecosystem bias stronger than referent-anchor bias", }, { "wrapper_address": "0005:2c35", "mask_pair": "0x0400:000a", "caller_anchor": "xref-dark signed slot-offset wrapper forwarding context +0x34", "descriptor_bias": "offset-specialized masked context creation; descriptor family unresolved", }, { "wrapper_address": "0005:2c68", "mask_pair": "0x0800:000b", "caller_anchor": "xref-dark signed slot-offset wrapper forwarding context +0x34", "descriptor_bias": "offset-specialized masked context creation; descriptor family unresolved", }, { "wrapper_address": "0005:2cd2", "mask_pair": "0x1000:000c", "caller_anchor": "direct caller anchor at 0005:0fee", "descriptor_bias": "active-event ecosystem bias stronger than referent-anchor bias", }, { "wrapper_address": "0004:f05c", "mask_pair": "0x2000:0015", "caller_anchor": "0004:f2b3 overlap/proximity branch with entity byte +0x32 state checks", "descriptor_bias": "gameplay-state lane feeding runtime materialization, not direct descriptor switch", }, { "wrapper_address": "0005:2d01", "mask_pair": "0x4000:000e", "caller_anchor": "direct callers at 0007:814e and 0007:822e", "descriptor_bias": "active-event ecosystem bias stronger than referent-anchor bias", }, { "wrapper_address": "0004:f033", "mask_pair": "0x8000:0007", "caller_anchor": "local wrapper seed recovered from direct instruction evidence", "descriptor_bias": "gameplay-side materialization lane; still descriptor-agnostic", }, { "wrapper_address": "0005:2d30", "mask_pair": "0x8000:000f", "caller_anchor": "entity id/class flag gate plus 0x0f16 / 0x20f dispatch-entry emission path", "descriptor_bias": "strongest current active-event ecosystem candidate in the ladder", }, ) SCUMMVM_EVENT_NAME_HINTS: tuple[str, ...] = ( "look", "use", "anim", "setActivity", "cachein", "hit", "gotHit", "hatch", "schedule", "release", "equip", "unequip", "combine", "func0D", "calledFromAnim", "enterFastArea", "leaveFastArea", "cast", "justMoved", "avatarStoleSomething", "animGetHit", "unhatch", "func16", "func17", "func18", "func19", "func1A", "func1B", "func1C", "func1D", "func1E", "func1F", ) VERIFIED_REPEATED_TEMPLATE_FAMILIES: tuple[tuple[str, tuple[str, ...]], ...] = ( ("referent-anchor-twin", ("JELYHACK", "JELYH2")), ("boot-event-core", ("AND_BOOT", "BRO_BOOT", "COR_BOOT", "REE_BOOT", "VAR_BOOT")), ("callback-eventtrigger", ("SURCAMNS", "SURCAMEW")), ("environmental-event", ("FLAMEBOX", "NOSTRIL", "STEAMBOX")), ) FAMILY_ARTIFACT_SPECS: tuple[FamilyArtifactSpec, ...] = ( FamilyArtifactSpec( output_stem="boot_family_decompile", title="_BOOT Family Decompiled Event Sketches", labels=("AND_BOOT", "BRO_BOOT", "COR_BOOT", "REE_BOOT", "VAR_BOOT"), ), FamilyArtifactSpec( output_stem="callback_family_decompile", title="SURCAM Callback Family Decompiled Event Sketches", labels=("SURCAMNS", "SURCAMEW"), ), FamilyArtifactSpec( output_stem="environmental_family_decompile", title="Environmental Family Decompiled Event Sketches", labels=("FLAMEBOX", "NOSTRIL", "STEAMBOX"), ), ) VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS: tuple[RepeatedFamilyRowExpectation, ...] = ( RepeatedFamilyRowExpectation("JELYHACK", 0x01, 0x002A, 0x00000001, 0x00D4, 0x00FE, 42, "referent-anchor-twin/shared-slot-0x01/same-length-template"), RepeatedFamilyRowExpectation("JELYH2", 0x01, 0x002A, 0x00000001, 0x00D4, 0x00FE, 42, "referent-anchor-twin/shared-slot-0x01/same-length-template"), RepeatedFamilyRowExpectation("AND_BOOT", 0x0A, 0x0253, 0x00000001, 0x00D4, 0x0327, 595, "boot-event-core/shared-slot-0x0A/shared-slot-template"), RepeatedFamilyRowExpectation("AND_BOOT", 0x0F, 0x0237, 0x00000254, 0x0327, 0x055E, 567, "boot-event-core/shared-slot-0x0F/shared-slot-template"), RepeatedFamilyRowExpectation("AND_BOOT", 0x10, 0x003B, 0x0000048B, 0x055E, 0x0599, 59, "boot-event-core/shared-slot-0x10/same-length-template"), RepeatedFamilyRowExpectation("BRO_BOOT", 0x0A, 0x02D5, 0x00000001, 0x00D4, 0x03A9, 725, "boot-event-core/shared-slot-0x0A/shared-slot-template"), RepeatedFamilyRowExpectation("BRO_BOOT", 0x0F, 0x024C, 0x000002D6, 0x03A9, 0x05F5, 588, "boot-event-core/shared-slot-0x0F/shared-slot-template"), RepeatedFamilyRowExpectation("BRO_BOOT", 0x10, 0x003B, 0x00000522, 0x05F5, 0x0630, 59, "boot-event-core/shared-slot-0x10/same-length-template"), RepeatedFamilyRowExpectation("COR_BOOT", 0x0A, 0x0227, 0x00000001, 0x00D4, 0x02FB, 551, "boot-event-core/shared-slot-0x0A/shared-slot-template"), RepeatedFamilyRowExpectation("COR_BOOT", 0x0F, 0x0234, 0x00000228, 0x02FB, 0x052F, 564, "boot-event-core/shared-slot-0x0F/shared-slot-template"), RepeatedFamilyRowExpectation("COR_BOOT", 0x10, 0x003B, 0x0000045C, 0x052F, 0x056A, 59, "boot-event-core/shared-slot-0x10/same-length-template"), RepeatedFamilyRowExpectation("REE_BOOT", 0x0A, 0x034B, 0x00000001, 0x00D4, 0x041F, 843, "boot-event-core/shared-slot-0x0A/shared-slot-template"), RepeatedFamilyRowExpectation("REE_BOOT", 0x0F, 0x025C, 0x0000034C, 0x041F, 0x067B, 604, "boot-event-core/shared-slot-0x0F/shared-slot-template"), RepeatedFamilyRowExpectation("REE_BOOT", 0x10, 0x003B, 0x000005A8, 0x067B, 0x06B6, 59, "boot-event-core/shared-slot-0x10/same-length-template"), RepeatedFamilyRowExpectation("VAR_BOOT", 0x0A, 0x029A, 0x00000001, 0x00D4, 0x036E, 666, "boot-event-core/shared-slot-0x0A/shared-slot-template"), RepeatedFamilyRowExpectation("VAR_BOOT", 0x0F, 0x0244, 0x0000029B, 0x036E, 0x05B2, 580, "boot-event-core/shared-slot-0x0F/shared-slot-template"), RepeatedFamilyRowExpectation("VAR_BOOT", 0x10, 0x003B, 0x000004DF, 0x05B2, 0x05ED, 59, "boot-event-core/shared-slot-0x10/same-length-template"), RepeatedFamilyRowExpectation("SURCAMNS", 0x01, 0x0051, 0x000000D2, 0x01B7, 0x0208, 81, "callback-eventtrigger/shared-slot-0x01/shared-slot-template"), RepeatedFamilyRowExpectation("SURCAMNS", 0x0A, 0x00D1, 0x00000001, 0x00E6, 0x01B7, 209, "callback-eventtrigger/shared-slot-0x0A/same-length-template"), RepeatedFamilyRowExpectation("SURCAMNS", 0x20, 0x02BA, 0x00000123, 0x0208, 0x04C2, 698, "callback-eventtrigger/shared-slot-0x20/same-length-template"), RepeatedFamilyRowExpectation("SURCAMNS", 0x21, 0x0709, 0x000003DD, 0x04C2, 0x0BCB, 1801, "callback-eventtrigger/shared-slot-0x21/shared-slot-template"), RepeatedFamilyRowExpectation("SURCAMNS", 0x22, 0x01A3, 0x00000AE6, 0x0BCB, 0x0D6E, 419, "callback-eventtrigger/shared-slot-0x22/same-length-template"), RepeatedFamilyRowExpectation("SURCAMEW", 0x01, 0x00F7, 0x000000D2, 0x01B7, 0x02AE, 247, "callback-eventtrigger/shared-slot-0x01/shared-slot-template"), RepeatedFamilyRowExpectation("SURCAMEW", 0x0A, 0x00D1, 0x00000001, 0x00E6, 0x01B7, 209, "callback-eventtrigger/shared-slot-0x0A/same-length-template"), RepeatedFamilyRowExpectation("SURCAMEW", 0x20, 0x02BA, 0x000001C9, 0x02AE, 0x0568, 698, "callback-eventtrigger/shared-slot-0x20/same-length-template"), RepeatedFamilyRowExpectation("SURCAMEW", 0x21, 0x0655, 0x00000483, 0x0568, 0x0BBD, 1621, "callback-eventtrigger/shared-slot-0x21/shared-slot-template"), RepeatedFamilyRowExpectation("SURCAMEW", 0x22, 0x01A3, 0x00000AD8, 0x0BBD, 0x0D60, 419, "callback-eventtrigger/shared-slot-0x22/same-length-template"), RepeatedFamilyRowExpectation("FLAMEBOX", 0x0A, 0x026A, 0x00000001, 0x00E0, 0x034A, 618, "environmental-event/shared-slot-0x0A/shared-slot-template"), RepeatedFamilyRowExpectation("FLAMEBOX", 0x20, 0x01AC, 0x0000026B, 0x034A, 0x04F6, 428, "environmental-event/shared-slot-0x20/shared-slot-template"), RepeatedFamilyRowExpectation("FLAMEBOX", 0x21, 0x029A, 0x00000417, 0x04F6, 0x0790, 666, "environmental-event/shared-slot-0x21/shared-slot-template"), RepeatedFamilyRowExpectation("NOSTRIL", 0x0A, 0x00C0, 0x00000001, 0x00E0, 0x01A0, 192, "environmental-event/shared-slot-0x0A/shared-slot-template"), RepeatedFamilyRowExpectation("NOSTRIL", 0x20, 0x0129, 0x000000C1, 0x01A0, 0x02C9, 297, "environmental-event/shared-slot-0x20/shared-slot-template"), RepeatedFamilyRowExpectation("NOSTRIL", 0x21, 0x01BE, 0x000001EA, 0x02C9, 0x0487, 446, "environmental-event/shared-slot-0x21/shared-slot-template"), RepeatedFamilyRowExpectation("STEAMBOX", 0x0A, 0x0266, 0x00000001, 0x00E0, 0x0346, 614, "environmental-event/shared-slot-0x0A/shared-slot-template"), RepeatedFamilyRowExpectation("STEAMBOX", 0x20, 0x01F6, 0x00000267, 0x0346, 0x053C, 502, "environmental-event/shared-slot-0x20/shared-slot-template"), RepeatedFamilyRowExpectation("STEAMBOX", 0x21, 0x02A7, 0x0000045D, 0x053C, 0x07E3, 679, "environmental-event/shared-slot-0x21/shared-slot-template"), ) def scummvm_event_name_hint(slot: int) -> str | None: if 0 <= slot < len(SCUMMVM_EVENT_NAME_HINTS): return SCUMMVM_EVENT_NAME_HINTS[slot] return None def object_index_from_table_offset(table_offset: int) -> int | None: if table_offset < 0x80: return None relative = table_offset - 0x80 if relative % 8 != 0: return None return relative // 8 def decode_name_table_entry(name_table_data: bytes, class_id: int) -> str | None: if class_id < 0: return None name_offset = 4 + 13 * class_id if name_offset + 13 > len(name_table_data): return None raw_name = name_table_data[name_offset:name_offset + 13] raw_name = raw_name.split(b"\x00", 1)[0].rstrip(b"\x00 ") if not raw_name: return None return raw_name.decode("latin-1", errors="replace") def annotate_class_layout(chunks: list[ExtractedChunk]) -> None: name_table_data: bytes | None = None for chunk in chunks: chunk.object_index = object_index_from_table_offset(chunk.table_offset) if chunk.object_index == 1: name_table_data = pathlib.Path(chunk.raw_path).read_bytes() chunk.class_parse_status = "name-table" for chunk in chunks: if chunk.object_index is None: chunk.class_parse_status = chunk.class_parse_status or "unaligned-table-offset" continue if chunk.object_index < 2: chunk.class_parse_status = chunk.class_parse_status or "non-class-object" continue chunk.class_id = chunk.object_index - 2 if name_table_data is not None: chunk.class_name_hint = decode_name_table_entry(name_table_data, chunk.class_id) raw_data = pathlib.Path(chunk.raw_path).read_bytes() if len(raw_data) < 20: chunk.class_parse_status = "too-small-for-class-header" continue raw_code_base_u32 = read_u32_le(raw_data, 8) chunk.raw_code_base_u32 = raw_code_base_u32 if raw_code_base_u32 > 0: chunk.code_base_minus_one = raw_code_base_u32 - 1 event_region = raw_code_base_u32 - 20 if event_region < 0: chunk.class_parse_status = "header-before-event-table" continue if event_region % 6 != 0: chunk.class_parse_status = "event-region-not-divisible-by-6" continue event_count = event_region // 6 event_table_end = 20 + event_count * 6 if event_table_end > len(raw_data): chunk.class_parse_status = "event-table-past-object-end" continue chunk.conservative_event_count = event_count chunk.event_table_end = event_table_end chunk.class_parse_status = "parsed-class-layout" def derive_class_event_rows(chunk: ExtractedChunk, raw_data: bytes) -> list[ClassEventRow]: if chunk.class_parse_status != "parsed-class-layout": return [] if chunk.object_index is None or chunk.class_id is None or chunk.conservative_event_count is None: return [] provisional_rows: list[tuple[int, int, int]] = [] for slot in range(chunk.conservative_event_count): entry_offset = 20 + 6 * slot raw_word = read_u16_le(raw_data, entry_offset) raw_code_offset = read_u32_le(raw_data, entry_offset + 2) provisional_rows.append((slot, raw_word, raw_code_offset)) non_zero_offsets = sorted( { raw_code_offset for _, _, raw_code_offset in provisional_rows if raw_code_offset != 0 } ) rows: list[ClassEventRow] = [] for slot, raw_word, raw_code_offset in provisional_rows: derived_body_start: int | None = None derived_body_end: int | None = None derived_body_length: int | None = None if raw_code_offset != 0 and chunk.code_base_minus_one is not None: body_start = chunk.code_base_minus_one + raw_code_offset next_offsets = [offset for offset in non_zero_offsets if offset > raw_code_offset] body_end = chunk.code_base_minus_one + next_offsets[0] if next_offsets else len(raw_data) if 0 <= body_start <= body_end <= len(raw_data): derived_body_start = body_start derived_body_end = body_end derived_body_length = body_end - body_start rows.append( ClassEventRow( entry_index=chunk.index, object_index=chunk.object_index, class_id=chunk.class_id, class_name_hint=chunk.class_name_hint or "", slot=slot, event_name_hint=scummvm_event_name_hint(slot), raw_event_entry_word=raw_word, raw_code_offset=raw_code_offset, derived_body_start=derived_body_start, derived_body_end=derived_body_end, derived_body_length=derived_body_length, ) ) return rows def build_class_event_rows( parsed_class_chunks: list[ExtractedChunk], ) -> tuple[list[ClassEventRow], dict[int, list[ClassEventRow]], dict[int, bytes]]: all_rows: list[ClassEventRow] = [] rows_by_entry: dict[int, list[ClassEventRow]] = {} raw_data_by_entry: dict[int, bytes] = {} for chunk in parsed_class_chunks: raw_data = pathlib.Path(chunk.raw_path).read_bytes() raw_data_by_entry[chunk.index] = raw_data rows = derive_class_event_rows(chunk, raw_data) rows_by_entry[chunk.index] = rows all_rows.extend(rows) return all_rows, rows_by_entry, raw_data_by_entry def build_repeated_template_status_map( parsed_class_chunks: list[ExtractedChunk], rows_by_entry: dict[int, list[ClassEventRow]], raw_data_by_entry: dict[int, bytes], ) -> dict[tuple[int, int], str]: status_by_row: dict[tuple[int, int], str] = {} chunk_by_label = { chunk.primary_label: chunk for chunk in parsed_class_chunks if chunk.primary_label } for family_name, labels in VERIFIED_REPEATED_TEMPLATE_FAMILIES: family_chunks = [chunk_by_label[label] for label in labels if label in chunk_by_label] if len(family_chunks) < 2: continue rows_by_slot: dict[int, list[tuple[ExtractedChunk, ClassEventRow, bytes]]] = {} for chunk in family_chunks: raw_data = raw_data_by_entry.get(chunk.index) if raw_data is None: continue for row in rows_by_entry.get(chunk.index, []): if row.raw_code_offset == 0: continue if row.derived_body_start is None or row.derived_body_end is None: continue body = raw_data[row.derived_body_start:row.derived_body_end] rows_by_slot.setdefault(row.slot, []).append((chunk, row, body)) for slot, slot_rows in rows_by_slot.items(): if len(slot_rows) < 2: continue lengths = {len(body) for _, _, body in slot_rows} bodies = {body for _, _, body in slot_rows} if len(bodies) == 1: status_suffix = "exact-body-clone" elif len(lengths) == 1: status_suffix = "same-length-template" else: status_suffix = "shared-slot-template" status = f"{family_name}/shared-slot-0x{slot:02X}/{status_suffix}" for chunk, row, _ in slot_rows: status_by_row[(chunk.index, row.slot)] = status return status_by_row def format_optional_hex(value: int | None, width: int = 0) -> str: if value is None: return "" if width > 0: return f"0x{value:0{width}X}" return f"0x{value:X}" def hex_edge(data: bytes, width: int = 8) -> str: if not data: return "" return data[:width].hex() def hex_tail(data: bytes, width: int = 8) -> str: if not data: return "" return data[-width:].hex() def write_family_decompile_artifact( out_dir: pathlib.Path, parsed_class_chunks: list[ExtractedChunk], rows_by_entry: dict[int, list[ClassEventRow]], raw_data_by_entry: dict[int, bytes], repeated_status_by_row: dict[tuple[int, int], str], spec: FamilyArtifactSpec, ) -> None: family_labels = set(spec.labels) family_chunks = [chunk for chunk in parsed_class_chunks if chunk.primary_label in family_labels] if not family_chunks: return family_chunks.sort(key=lambda chunk: chunk.primary_label or "") tsv_lines = [ "entry_index\tclass_id\tclass_name\tslot\tevent_name_hint\traw_event_entry_word\traw_code_offset\tderived_body_start\tderived_body_end\tderived_body_length\trepeated_template_status\tbody_sha1\tbody_prefix_hex\tbody_suffix_hex" ] md_lines = [ f"# {spec.title}", "", "This is a reversible per-class rendering derived directly from `class_event_index.tsv` plus the raw extracted chunk bytes.", "ScummVM event labels remain hints only; the authoritative data here is the slot id, raw row bytes, and derived body window.", "", ] for chunk in family_chunks: rows = [row for row in rows_by_entry.get(chunk.index, []) if row.raw_code_offset != 0] if not rows: continue raw_data = raw_data_by_entry[chunk.index] md_lines.extend([ f"## {chunk.primary_label}", "", "```yaml", "class:", f" entry_index: 0x{chunk.index:03X}", f" class_id: 0x{chunk.class_id:X}", f" class_name: {chunk.primary_label}", f" class_object_index: 0x{chunk.object_index:X}", f" raw_code_base_u32: 0x{chunk.raw_code_base_u32:X}", f" code_base_minus_one: 0x{chunk.code_base_minus_one:X}", f" conservative_event_count: {chunk.conservative_event_count}", " events:", ]) for row in rows: body = b"" if row.derived_body_start is not None and row.derived_body_end is not None: body = raw_data[row.derived_body_start:row.derived_body_end] repeated_status = repeated_status_by_row.get((row.entry_index, row.slot), "") body_sha1 = hashlib.sha1(body).hexdigest() if body else "" md_lines.extend([ f" - slot: 0x{row.slot:02x}", f" event_name_hint: {row.event_name_hint or ''}", f" raw_event_entry_word: 0x{row.raw_event_entry_word:04x}", f" raw_code_offset: 0x{row.raw_code_offset:08x}", f" derived_body_start: {format_optional_hex(row.derived_body_start, 4).lower() or 'null'}", f" derived_body_end: {format_optional_hex(row.derived_body_end, 4).lower() or 'null'}", f" derived_body_length: {row.derived_body_length if row.derived_body_length is not None else 'null'}", f" repeated_template_status: {repeated_status or 'unique-or-unclassified'}", f" body_sha1: {body_sha1 or 'null'}", f" body_prefix_hex: {hex_edge(body) or 'null'}", f" body_suffix_hex: {hex_tail(body) or 'null'}", ]) tsv_lines.append( "{entry_index}\t0x{class_id:X}\t{class_name}\t0x{slot:02X}\t{event_name_hint}\t0x{raw_event_entry_word:04X}\t0x{raw_code_offset:08X}\t{derived_body_start}\t{derived_body_end}\t{derived_body_length}\t{repeated_template_status}\t{body_sha1}\t{body_prefix_hex}\t{body_suffix_hex}".format( entry_index=row.entry_index, class_id=row.class_id, class_name=chunk.primary_label or "", slot=row.slot, event_name_hint=row.event_name_hint or "", raw_event_entry_word=row.raw_event_entry_word, raw_code_offset=row.raw_code_offset, derived_body_start=format_optional_hex(row.derived_body_start, 4), derived_body_end=format_optional_hex(row.derived_body_end, 4), derived_body_length=(row.derived_body_length if row.derived_body_length is not None else ""), repeated_template_status=repeated_status, body_sha1=body_sha1, body_prefix_hex=hex_edge(body), body_suffix_hex=hex_tail(body), ) ) md_lines.extend([ "```", "", ]) (out_dir / f"{spec.output_stem}.md").write_text("\n".join(md_lines), encoding="utf-8") (out_dir / f"{spec.output_stem}.tsv").write_text("\n".join(tsv_lines) + "\n", encoding="utf-8") def validate_verified_repeated_family_regressions( parsed_class_chunks: list[ExtractedChunk], rows_by_entry: dict[int, list[ClassEventRow]], repeated_status_by_row: dict[tuple[int, int], str], ) -> list[str]: chunk_by_label = { chunk.primary_label: chunk for chunk in parsed_class_chunks if chunk.primary_label } expected_slots_by_class: dict[str, set[int]] = {} for expectation in VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS: expected_slots_by_class.setdefault(expectation.class_name, set()).add(expectation.slot) report_lines = [ "record_type\tclass_name\tslot\texpected\tactual\tstatus" ] errors: list[str] = [] for class_name, expected_slots in sorted(expected_slots_by_class.items()): chunk = chunk_by_label.get(class_name) actual_slots: set[int] = set() if chunk is not None: actual_slots = { row.slot for row in rows_by_entry.get(chunk.index, []) if row.raw_code_offset != 0 } status = "ok" if actual_slots == expected_slots else "mismatch" report_lines.append( "slot-set\t{class_name}\t*\t{expected}\t{actual}\t{status}".format( class_name=class_name, expected=",".join(f"0x{slot:02X}" for slot in sorted(expected_slots)), actual=",".join(f"0x{slot:02X}" for slot in sorted(actual_slots)), status=status, ) ) if status != "ok": errors.append( f"{class_name}: expected non-zero slots {sorted(expected_slots)}, found {sorted(actual_slots)}" ) for expectation in VERIFIED_REPEATED_FAMILY_ROW_EXPECTATIONS: chunk = chunk_by_label.get(expectation.class_name) if chunk is None: errors.append(f"missing repeated-family class {expectation.class_name}") report_lines.append( f"row\t{expectation.class_name}\t0x{expectation.slot:02X}\tpresent\tmissing-class\tmismatch" ) continue row = next( (candidate for candidate in rows_by_entry.get(chunk.index, []) if candidate.slot == expectation.slot), None, ) if row is None: errors.append(f"missing row {expectation.class_name} slot 0x{expectation.slot:02X}") report_lines.append( f"row\t{expectation.class_name}\t0x{expectation.slot:02X}\tpresent\tmissing-row\tmismatch" ) continue actual_values = ( row.raw_event_entry_word, row.raw_code_offset, row.derived_body_start, row.derived_body_end, row.derived_body_length, repeated_status_by_row.get((row.entry_index, row.slot), ""), ) expected_values = ( expectation.raw_event_entry_word, expectation.raw_code_offset, expectation.derived_body_start, expectation.derived_body_end, expectation.derived_body_length, expectation.repeated_template_status, ) status = "ok" if actual_values == expected_values else "mismatch" report_lines.append( "row\t{class_name}\t0x{slot:02X}\t{expected}\t{actual}\t{status}".format( class_name=expectation.class_name, slot=expectation.slot, expected="|".join( [ f"0x{expectation.raw_event_entry_word:04X}", f"0x{expectation.raw_code_offset:08X}", f"0x{expectation.derived_body_start:04X}", f"0x{expectation.derived_body_end:04X}", str(expectation.derived_body_length), expectation.repeated_template_status, ] ), actual="|".join( [ f"0x{row.raw_event_entry_word:04X}", f"0x{row.raw_code_offset:08X}", format_optional_hex(row.derived_body_start, 4), format_optional_hex(row.derived_body_end, 4), str(row.derived_body_length if row.derived_body_length is not None else ""), repeated_status_by_row.get((row.entry_index, row.slot), ""), ] ), status=status, ) ) if status != "ok": errors.append( "{class_name} slot 0x{slot:02X}: expected {expected}, found {actual}".format( class_name=expectation.class_name, slot=expectation.slot, expected=expected_values, actual=actual_values, ) ) if errors: raise ValueError( "repeated-family regression mismatch:\n- " + "\n- ".join(errors) ) return report_lines def readable_neighbor_chunks( center: ExtractedChunk, chunk_by_index: dict[int, ExtractedChunk], total_chunks: int, window: int, ) -> list[ExtractedChunk]: neighbors: list[ExtractedChunk] = [] for neighbor_index in range(max(0, center.index - window), min(total_chunks, center.index + window + 1)): if neighbor_index == center.index: continue neighbor = chunk_by_index[neighbor_index] if not (event_tag_kind(neighbor) or has_referent_field(neighbor) or neighbor.primary_label == center.primary_label): continue neighbors.append(neighbor) return neighbors def unique_preserve_order(values: list[str]) -> list[str]: seen: set[str] = set() ordered: list[str] = [] for value in values: if not value or value in seen: continue seen.add(value) ordered.append(value) return ordered def section_runtime_ops(section_name: str) -> list[str]: if section_name == "Callback trigger lane": return [ "MATERIALIZE_OR_FORWARD_VALUE", "PUSH_FRAME_WORD_LITERAL", "COMPARE_STREAM_DWORD_AND_PUSH_BOOL", "FINALIZE_MIXED_VALUE_TO_OUTPTR", ] return [ "APPEND_UNIQUE_INLINE", "APPEND_UNIQUE_INDIRECT", "REMOVE_MATCHING_INDIRECT", "REMOVE_MATCHING_INLINE", "MATERIALIZE_OR_FORWARD_VALUE", "PREPEND_INLINE_PAYLOAD", "BUILD_ENTITY_LINK_MATRIX", "EMIT_OR_PUSHBACK_RESULT", "FINALIZE_MIXED_VALUE_TO_OUTPTR", ] def section_mask_pairs(section_name: str) -> list[str]: if section_name == "JELYHACK anchor lane": return ["indirect-only active-event-biased ladder; no direct anchor-specific mask proven"] if section_name == "Callback trigger lane": return ["no callback-specific mask pair proven; current ladder favors active event carriers"] return [ "0x0001:0000", "0x0002:0001", "0x0004:0002", "0x0010:0004", "0x0020:0005", "0x0200:0009", "0x0400:000a", "0x0800:000b", "0x1000:000c", "0x2000:0015", "0x4000:000e", "0x8000:0007", "0x8000:000f", ] def section_bridge_note(section_name: str) -> str: if section_name == "JELYHACK anchor lane": return "Referent-only anchors are now readable as payload owners, but the current mask ladder still correlates more strongly with active-event descriptors than with anchor-only rows." if section_name == "Callback trigger lane": return "Callback/eventTrigger descriptors are structurally distinct from the active event lane, so the runtime bridge is still generic slot-backed context flow rather than a callback-specific opcode family." if section_name == "EVENT hub lane": return "This is the strongest current descriptor-side bridge into the active event runtime lane: the neighborhood contains explicit event cores and matches the proven payload-chain plus link-matrix VM behavior." if section_name == "Environmental event lane": return "Environmental descriptors share the same active event field grammar, so they likely ride the same generic VM event lane even though no hazard-specific opcode split is proven yet." return "Descriptor-side and runtime-side evidence align only at the conservative family level." def write_runtime_bridge_reports( out_dir: pathlib.Path, descriptor_chunks: list[ExtractedChunk], chunk_by_index: dict[int, ExtractedChunk], total_chunks: int, ) -> None: vm_ir_lines = [ "stage_address\tir_name\topcode_or_lane\tpayload_shape\tevidence" ] for row in VERIFIED_VM_IR_ROWS: vm_ir_lines.append( "{stage_address}\t{ir_name}\t{opcode_or_lane}\t{payload_shape}\t{evidence}".format(**row) ) (out_dir / "runtime_vm_ir.tsv").write_text("\n".join(vm_ir_lines) + "\n", encoding="utf-8") mask_lines = [ "wrapper_address\tmask_pair\tcaller_anchor\tdescriptor_bias" ] for row in VERIFIED_MASK_LADDER_ROWS: mask_lines.append( "{wrapper_address}\t{mask_pair}\t{caller_anchor}\t{descriptor_bias}".format(**row) ) (out_dir / "vm_mask_ladder.tsv").write_text("\n".join(mask_lines) + "\n", encoding="utf-8") focus_sets = [ ("jelyhack_anchor_attachment", "JELYHACK anchor lane", {"JELYHACK", "JELYH2"}, 8), ("event_hub_cluster", "EVENT hub lane", {"EVENT", "COR_BOOT", "NPCTRIG"}, 5), ("environmental_event_cluster", "Environmental event lane", {"FLAMEBOX", "NOSTRIL", "STEAMBOX"}, 5), ("callback_trigger_cluster", "Callback trigger lane", {"SURCAMNS", "SURCAMEW"}, 5), ] script_tsv_lines = [ "template_id\tsection\tcenter_index\tcenter_label\tattach_labels\tcallback_labels\tneighbor_labels\tevent_families\truntime_ops\tmask_pairs\towner_source\tmirror_write\tselector_status\tbridge_note" ] script_md_lines = [ "# Readable Script IR", "", "This report joins descriptor neighborhoods to the verified 000d VM/runtime lane.", "It stays conservative: opcode-family and mask-family evidence is carried forward only where the binary proves it.", "", "## Verified Runtime Lane", "", "- Owner path: `entity_vm_runtime_init_from_path_if_configured -> entity_vm_runtime_create -> entity_vm_runtime_owner_resource_create`", "- Slot source: `(+0x10/+0x12) + 0x0d*slot + 4` inside the runtime owner/resource object", "- Context seed: `entity_vm_context_create_from_slot_index` copies that source into `+0xd6/+0xd8` and mirrors it to `0x39ca[slot]`", "- Selector status: `0x19/0x1a/0x1b` are proven inside `000d:0988`; `0x18` is still implied, and the upstream seed into `[BP-0x32]` remains unresolved", "", "## Verified VM IR Operators", "", "| Stage | IR | Opcode / Lane | Payload Shape |", "|---|---|---|---|", ] for row in VERIFIED_VM_IR_ROWS: script_md_lines.append( "| {stage_address} | {ir_name} | {opcode_or_lane} | {payload_shape} |".format(**row) ) script_md_lines.extend([ "", "## Verified Mask Ladder", "", "| Wrapper | Mask | Caller Anchor | Descriptor Bias |", "|---|---|---|---|", ]) for row in VERIFIED_MASK_LADDER_ROWS: script_md_lines.append( "| {wrapper_address} | {mask_pair} | {caller_anchor} | {descriptor_bias} |".format(**row) ) owner_source = ( "000d:44df -> 000d:4c99 -> 000d:7000 -> (+0x10/+0x12) + 0x0d*slot + 4" ) mirror_write = "entity_vm_context_create_from_slot_index writes the same source pair to 0x39ca[context_slot]" selector_status = "0x19/0x1a/0x1b proven in 000d:0988; 0x18 implied; upstream [BP-0x32] seed unresolved" for template_id, section_name, center_labels, window in focus_sets: centers = [chunk for chunk in descriptor_chunks if chunk.primary_label in center_labels] if not centers: continue script_md_lines.extend(["", f"## {section_name}", ""]) for center in centers: neighbors = readable_neighbor_chunks(center, chunk_by_index, total_chunks, window) attach_labels = unique_preserve_order([ clean_token(neighbor.primary_label or "") for neighbor in neighbors if event_tag_kind(neighbor) == "event" ]) callback_labels = unique_preserve_order([ clean_token(neighbor.primary_label or "") for neighbor in neighbors if event_tag_kind(neighbor) == "eventTrigger" ]) neighbor_labels = unique_preserve_order([ clean_token(neighbor.primary_label or "") for neighbor in neighbors if has_referent_field(neighbor) and event_tag_kind(neighbor) == "" ]) family_labels = unique_preserve_order([ classify_event_family(neighbor) for neighbor in neighbors ]) runtime_ops = section_runtime_ops(section_name) mask_pairs = section_mask_pairs(section_name) bridge_note = section_bridge_note(section_name) script_tsv_lines.append( "{template_id}\t{section}\t{center_index}\t{center_label}\t{attach_labels}\t{callback_labels}\t{neighbor_labels}\t{event_families}\t{runtime_ops}\t{mask_pairs}\t{owner_source}\t{mirror_write}\t{selector_status}\t{bridge_note}".format( template_id=template_id, section=section_name, center_index=center.index, center_label=clean_token(center.primary_label or ""), attach_labels=",".join(attach_labels), callback_labels=",".join(callback_labels), neighbor_labels=",".join(neighbor_labels), event_families=",".join(family_labels), runtime_ops=",".join(runtime_ops), mask_pairs=",".join(mask_pairs), owner_source=owner_source, mirror_write=mirror_write, selector_status=selector_status, bridge_note=bridge_note, ) ) script_md_lines.append(f"### {center.index}: {center.primary_label}") script_md_lines.append("") script_md_lines.append(f"Descriptor focus: `{readable_signature(center)}`") script_md_lines.append("") script_md_lines.append("Descriptor-side attachments:") script_md_lines.append(f"- Active event neighbors: {', '.join(attach_labels) or 'none proven in window'}") script_md_lines.append(f"- Callback neighbors: {', '.join(callback_labels) or 'none proven in window'}") script_md_lines.append(f"- Referent-side neighbors: {', '.join(neighbor_labels) or 'none proven in window'}") script_md_lines.append(f"- Event families present: {', '.join(family_labels) or 'none'}") script_md_lines.append("") script_md_lines.append("Runtime bridge:") script_md_lines.append(f"- Runtime ops: {', '.join(runtime_ops)}") script_md_lines.append(f"- Mask pairs: {', '.join(mask_pairs)}") script_md_lines.append(f"- Owner source: {owner_source}") script_md_lines.append(f"- Mirror write: {mirror_write}") script_md_lines.append(f"- Selector status: {selector_status}") script_md_lines.append(f"- Interpretation: {bridge_note}") script_md_lines.append("") script_md_lines.append("```text") script_md_lines.append(readable_signature(center)) for label in attach_labels: script_md_lines.append(f"attach {label}(...) # active event-bearing neighbor") for label in callback_labels: script_md_lines.append(f"callback {label}(...) # eventTrigger-bearing neighbor") for label in neighbor_labels: script_md_lines.append(f"near {label}(...) # referent-side local context") script_md_lines.append("") script_md_lines.append("vm_effect:") for runtime_op in runtime_ops: script_md_lines.append(f" {runtime_op}(...)" ) script_md_lines.append("```") script_md_lines.append("") (out_dir / "readable_script_ir.tsv").write_text("\n".join(script_tsv_lines) + "\n", encoding="utf-8") (out_dir / "readable_script_ir.md").write_text("\n".join(script_md_lines), encoding="utf-8") def chunk_bridge_family(chunk: ExtractedChunk) -> str: event_family = classify_event_family(chunk) if event_family: return event_family if chunk_role(chunk) == "referent-anchor": return "referent-anchor" return "" def family_runtime_ops(family: str) -> list[str]: if family == "callback-eventtrigger": return [ "MATERIALIZE_OR_FORWARD_VALUE", "PUSH_FRAME_WORD_LITERAL", "COMPARE_STREAM_DWORD_AND_PUSH_BOOL", "FINALIZE_MIXED_VALUE_TO_OUTPTR", ] if family == "referent-anchor": return [ "APPEND_UNIQUE_INLINE", "APPEND_UNIQUE_INDIRECT", "REMOVE_MATCHING_INDIRECT", "REMOVE_MATCHING_INLINE", "MATERIALIZE_OR_FORWARD_VALUE", ] return [ "APPEND_UNIQUE_INLINE", "APPEND_UNIQUE_INDIRECT", "REMOVE_MATCHING_INDIRECT", "REMOVE_MATCHING_INLINE", "MATERIALIZE_OR_FORWARD_VALUE", "PREPEND_INLINE_PAYLOAD", "BUILD_ENTITY_LINK_MATRIX", "EMIT_OR_PUSHBACK_RESULT", "FINALIZE_MIXED_VALUE_TO_OUTPTR", ] def family_mask_pairs(family: str) -> list[str]: if family == "referent-anchor": return ["anchor role uses referent registry and payload ownership; no anchor-specific slot mask proven"] if family == "callback-eventtrigger": return ["no callback-specific mask pair proven; verified ladder still favors active event carriers"] return [ "0x0001:0000", "0x0002:0001", "0x0004:0002", "0x0010:0004", "0x0020:0005", "0x0200:0009", "0x0400:000a", "0x0800:000b", "0x1000:000c", "0x2000:0015", "0x4000:000e", "0x8000:0007", "0x8000:000f", ] def family_bridge_metadata() -> tuple[dict[str, str], ...]: return ( { "lane_rank": "1", "primary_runtime_lane": "active-event payload lane", "descriptor_family": "event-hub", "fit_strength": "strongest", "confidence": "high", "why": "Explicit 69:0A00 event tag plus the richest source/dest/door/link/time/counter payload shape; best current match for the VM payload-chain plus link-matrix lane.", "exemplar": "EVENT", }, { "lane_rank": "2", "primary_runtime_lane": "active-event payload lane", "descriptor_family": "boot-event-core", "fit_strength": "strong", "confidence": "high", "why": "All five _BOOT descriptors share one compact referent,event,counter,item schema and sit beside referent-heavy object islands that fit the same active-event runtime lane.", "exemplar": "COR_BOOT", }, { "lane_rank": "3", "primary_runtime_lane": "active-event payload lane", "descriptor_family": "npc-trigger", "fit_strength": "strong", "confidence": "moderate-high", "why": "NPCTRIG carries an explicit event field and sits in the same compact event-bearing core as EVENT and COR_BOOT, but its narrower field set makes it look more satellite than hub.", "exemplar": "NPCTRIG", }, { "lane_rank": "4", "primary_runtime_lane": "active-event payload lane", "descriptor_family": "minimal-event-core", "fit_strength": "moderate", "confidence": "moderate", "why": "SFXTRIG keeps the active event tag while stripping most side fields, so it still fits the live event lane but as a smaller attachment form rather than a full hub or boot core.", "exemplar": "SFXTRIG", }, { "lane_rank": "5", "primary_runtime_lane": "active-event payload lane", "descriptor_family": "environmental-event", "fit_strength": "moderate", "confidence": "moderate", "why": "FLAMEBOX, NOSTRIL, and STEAMBOX share the same active event grammar, but no hazard-specific opcode or mask split is proven yet beyond the generic active-event-biased ladder.", "exemplar": "FLAMEBOX", }, { "lane_rank": "6", "primary_runtime_lane": "referent-anchor / payload-owner lane", "descriptor_family": "referent-anchor", "fit_strength": "strong", "confidence": "moderate-high", "why": "JELYHACK and JELYH2 are still referent-only, but the VM referent registry and payload-chain machinery now make that a live anchor role rather than inert metadata.", "exemplar": "JELYHACK", }, { "lane_rank": "7", "primary_runtime_lane": "callback / attachment lane", "descriptor_family": "callback-eventtrigger", "fit_strength": "weak-moderate", "confidence": "moderate", "why": "SURCAMNS and SURCAMEW are structurally coherent callback holders with eventTrigger tags, but the current mask ladder and opcode evidence still align more strongly with active event carriers than callback-specific dispatch.", "exemplar": "SURCAMNS", }, ) def family_chunk_map(descriptor_chunks: list[ExtractedChunk]) -> dict[str, list[ExtractedChunk]]: grouped: dict[str, list[ExtractedChunk]] = {} for chunk in descriptor_chunks: family = chunk_bridge_family(chunk) if not family: continue grouped.setdefault(family, []).append(chunk) return grouped def choose_family_exemplar(family: str, chunks: list[ExtractedChunk], preferred_label: str) -> ExtractedChunk | None: for chunk in chunks: if chunk.primary_label == preferred_label: return chunk return chunks[0] if chunks else None def family_script_block(exemplar: ExtractedChunk, family: str, labels: list[str]) -> list[str]: lines = [readable_signature(exemplar)] if family == "event-hub": lines.extend([ "owner_slot = runtime_owner_table[slot]", "chain = APPEND_UNIQUE_INLINE(...) or APPEND_UNIQUE_INDIRECT(...)", "chain = REMOVE_MATCHING_INDIRECT(...) or REMOVE_MATCHING_INLINE(...)", "payload = PREPEND_INLINE_PAYLOAD(...) when caller bytes are present", "links = BUILD_ENTITY_LINK_MATRIX(shape_a, shape_b, entity_ids)", "emit EVENT-style result through FINALIZE_MIXED_VALUE_TO_OUTPTR(...)", ]) elif family == "boot-event-core": lines.extend([ "anchor referent/event/counter/item into one compact event core", "materialize slot-backed value from runtime_owner_table[slot]", "mutate referent payload chain via opcode_0x18_to_0x1b family", "emit boot-style active event result", ]) elif family == "npc-trigger": lines.extend([ "materialize slot-backed trigger payload", "attach event plus item/item2/typeNpc side fields", "emit NPC-trigger result through shared opcode epilogue", ]) elif family == "minimal-event-core": lines.extend([ "bind referent to minimal event payload", "reuse generic active-event mutation path without hub-style side fields", ]) elif family == "environmental-event": lines.extend([ "bind referent plus event to hazard-specific side fields", "reuse generic active-event lane; no hazard-specific opcode split proven", ]) elif family == "referent-anchor": lines.extend([ "referent_id = registry anchor", "payload_chain = mutable owner-side chain attached to the referent", "neighboring event-bearing descriptor supplies live event semantics", "likely attachments: REE_BOOT, SURCAMEW, SFXTRIG", ]) else: lines.extend([ "callback-side attachment remains descriptor-visible", "runtime bridge is still generic slot-backed context flow rather than callback-specific opcode dispatch", ]) return lines def write_runtime_family_bridge_reports(out_dir: pathlib.Path, descriptor_chunks: list[ExtractedChunk]) -> None: grouped = family_chunk_map(descriptor_chunks) owner_source = "000d:44df -> 000d:4c99 -> 000d:7000 -> (+0x10/+0x12) + 0x0d*slot + 4" loader_evidence = ( "0009:67b6/6916 walk helper-owned +0x10/+0x18 tables, format per-entry paths, and then open/read/close files" ) selector_status = "0x19/0x1a/0x1b proven in 000d:0988; 0x18 implied; upstream [BP-0x32] seed unresolved" tsv_lines = [ "lane_rank\tprimary_runtime_lane\tdescriptor_family\trepresentative_labels\tfit_strength\tconfidence\truntime_ops\tmask_pairs\towner_source\tloader_evidence\tselector_status\twhy" ] md_lines = [ "# Runtime Descriptor Family Rankings", "", "This report ranks descriptor families against the currently verified 000d VM/runtime lanes.", "It is intentionally conservative: it scores ecosystem-level fit, not a direct descriptor-id-to-opcode decode.", "", "## Owner Source", "", f"- Owner path: `{owner_source}`", f"- Loader evidence: `{loader_evidence}`", f"- Selector status: `{selector_status}`", "", "## Ranked Families", "", "| Rank | Runtime Lane | Descriptor Family | Labels | Fit | Confidence |", "|---:|---|---|---|---|---|", ] for metadata in family_bridge_metadata(): family = metadata["descriptor_family"] family_chunks = grouped.get(family, []) if not family_chunks: continue labels = unique_preserve_order([ clean_token(chunk.primary_label or "") for chunk in family_chunks if chunk.primary_label ]) exemplar = choose_family_exemplar(family, family_chunks, metadata["exemplar"]) if exemplar is None: continue runtime_ops = family_runtime_ops(family) mask_pairs = family_mask_pairs(family) tsv_lines.append( "{lane_rank}\t{primary_runtime_lane}\t{descriptor_family}\t{representative_labels}\t{fit_strength}\t{confidence}\t{runtime_ops}\t{mask_pairs}\t{owner_source}\t{loader_evidence}\t{selector_status}\t{why}".format( lane_rank=metadata["lane_rank"], primary_runtime_lane=metadata["primary_runtime_lane"], descriptor_family=family, representative_labels=",".join(labels), fit_strength=metadata["fit_strength"], confidence=metadata["confidence"], runtime_ops=",".join(runtime_ops), mask_pairs=",".join(mask_pairs), owner_source=owner_source, loader_evidence=loader_evidence, selector_status=selector_status, why=metadata["why"], ) ) md_lines.append( "| {lane_rank} | {primary_runtime_lane} | {descriptor_family} | {labels} | {fit_strength} | {confidence} |".format( lane_rank=metadata["lane_rank"], primary_runtime_lane=metadata["primary_runtime_lane"], descriptor_family=family, labels=", ".join(labels), fit_strength=metadata["fit_strength"], confidence=metadata["confidence"], ) ) md_lines.extend([ "", f"## {metadata['lane_rank']}. {family}", "", f"- Runtime lane: {metadata['primary_runtime_lane']}", f"- Labels: {', '.join(labels)}", f"- Fit: {metadata['fit_strength']}", f"- Confidence: {metadata['confidence']}", f"- Why: {metadata['why']}", f"- Runtime ops: {', '.join(runtime_ops)}", f"- Mask pairs: {', '.join(mask_pairs)}", "", "```text", ]) md_lines.extend(family_script_block(exemplar, family, labels)) md_lines.extend([ "```", "", ]) (out_dir / "runtime_descriptor_family_rankings.tsv").write_text("\n".join(tsv_lines) + "\n", encoding="utf-8") (out_dir / "runtime_descriptor_family_rankings.md").write_text("\n".join(md_lines), encoding="utf-8") def write_readable_template_reports( out_dir: pathlib.Path, descriptor_chunks: list[ExtractedChunk], chunk_by_index: dict[int, ExtractedChunk], total_chunks: int, ) -> None: focus_sets = [ ("JELYHACK anchor lane", {"JELYHACK", "JELYH2"}, 8), ("EVENT hub lane", {"EVENT", "COR_BOOT", "NPCTRIG"}, 5), ("Environmental event lane", {"FLAMEBOX", "NOSTRIL", "STEAMBOX"}, 5), ("Callback trigger lane", {"SURCAMNS", "SURCAMEW"}, 5), ] tsv_lines = [ "section\tcenter_index\tcenter_label\trelation\tneighbor_index\tdistance\tneighbor_label\trole\tfamily\ttag_kind\tfield_names" ] md_lines = [ "# EUSECODE Readable Descriptor Templates", "", "These are conservative descriptor-side pseudo-script sketches.", "They reflect verified field grammar and local table neighborhoods, not a direct opcode dump.", "", ] for section_name, center_labels, window in focus_sets: centers = [chunk for chunk in descriptor_chunks if chunk.primary_label in center_labels] if not centers: continue md_lines.append(f"## {section_name}") md_lines.append("") for center in centers: md_lines.append(f"### {center.index}: {center.primary_label}") md_lines.append("") md_lines.append("```text") md_lines.append(readable_signature(center)) for neighbor_index in range(max(0, center.index - window), min(total_chunks, center.index + window + 1)): if neighbor_index == center.index: continue neighbor = chunk_by_index[neighbor_index] if not (event_tag_kind(neighbor) or has_referent_field(neighbor) or neighbor.primary_label in center_labels): continue distance = neighbor.index - center.index relation = "near" if event_tag_kind(neighbor) == "event": relation = "attach" elif event_tag_kind(neighbor) == "eventTrigger": relation = "callback" md_lines.append( "{relation} {subject} # offset {distance:+d}{family_suffix}".format( relation=relation, subject=readable_subject(neighbor), distance=distance, family_suffix=( f", family={classify_event_family(neighbor)}" if classify_event_family(neighbor) else "" ), ) ) tsv_lines.append( "{section}\t{center_index}\t{center_label}\t{relation}\t{neighbor_index}\t{distance:+d}\t{neighbor_label}\t{role}\t{family}\t{tag_kind}\t{field_names}".format( section=section_name, center_index=center.index, center_label=clean_token(center.primary_label or ""), relation=relation, neighbor_index=neighbor.index, distance=distance, neighbor_label=clean_token(neighbor.primary_label or ""), role=chunk_role(neighbor), family=classify_event_family(neighbor), tag_kind=event_tag_kind(neighbor), field_names=",".join(clean_token(field_name) for field_name in neighbor.field_names), ) ) md_lines.append("```") md_lines.append("") family_lines = [ "## Family Signatures", "", "| Family | Label | Signature |", "|---|---|---|", ] for chunk in sorted(descriptor_chunks, key=lambda value: (classify_event_family(value), value.index)): family = classify_event_family(chunk) if not family: continue family_lines.append( "| {family} | {label} | {signature} |".format( family=family, label=clean_token(chunk.primary_label or ""), signature=readable_signature(chunk).replace("|", "/"), ) ) md_lines.extend(family_lines) md_lines.append("") (out_dir / "readable_descriptor_templates.md").write_text("\n".join(md_lines), encoding="utf-8") (out_dir / "readable_descriptor_templates.tsv").write_text("\n".join(tsv_lines) + "\n", encoding="utf-8") def header_u16_words(data: bytes, count: int = 16) -> list[str]: limit = min(len(data) // 2, count) return [f"0x{read_u16_le(data, index * 2):04X}" for index in range(limit)] def header_u32_words(data: bytes, count: int = 8) -> list[str]: limit = min(len(data) // 4, count) return [f"0x{read_u32_le(data, index * 4):08X}" for index in range(limit)] def interesting_printable_markers(data: bytes) -> list[str]: markers: list[str] = [] seen: set[str] = set() for run in iter_printable_runs(data, min_len=3): if not any(token in run for token in ("wx[", "wt$[", "t$t=t@", "$Q", "?\n", "?\r")): continue if run not in seen: seen.add(run) markers.append(run) return markers[:8] def write_island_graph( out_dir: pathlib.Path, output_name: str, title: str, center_labels: set[str], descriptor_chunks: list[ExtractedChunk], chunk_by_index: dict[int, ExtractedChunk], total_chunks: int, window: int = 5, ) -> None: centers = [chunk for chunk in descriptor_chunks if chunk.primary_label in center_labels] if not centers: return island_indices = sorted( { neighbor_index for center in centers for neighbor_index in range(max(0, center.index - window), min(total_chunks, center.index + window + 1)) } ) island_lines = [f"# {title}", "", "## Nodes", "", "| Index | Label | Role | Fields | Event Evidence |", "|---:|---|---|---|---|"] for index in island_indices: chunk = chunk_by_index[index] island_lines.append( "| {index} | {label} | {role} | {fields} | {evidence} |".format( index=index, label=chunk.primary_label or "", role=chunk_role(chunk), fields=",".join(chunk.field_names) or "-", evidence=",".join(get_event_evidence(chunk)) or "-", ) ) island_lines.extend(["", "## Edges", "", "| Source | Relation | Target | Evidence |", "|---|---|---|---|"]) for center in centers: for neighbor_index in range(max(0, center.index - window), min(total_chunks, center.index + window + 1)): if neighbor_index == center.index: continue neighbor = chunk_by_index[neighbor_index] relation = f"table-neighbor({neighbor.index - center.index:+d})" event_evidence = get_event_evidence(neighbor) if event_evidence: relation = f"possible-event-attachment({neighbor.index - center.index:+d})" island_lines.append( "| {source} ({source_index}) | {relation} | {target} ({target_index}) | {evidence} |".format( source=center.primary_label, source_index=center.index, relation=relation, target=neighbor.primary_label or "", target_index=neighbor.index, evidence=",".join(event_evidence) or "same local extraction neighborhood", ) ) (out_dir / output_name).write_text("\n".join(island_lines) + "\n", encoding="utf-8") def write_descriptor_compare( out_dir: pathlib.Path, output_name: str, labels: set[str], descriptor_chunks: list[ExtractedChunk], ) -> None: compare_lines = [ "entry_index\tlabel\trole\tdata_offset\tdeclared_size\theader_u16\theader_u32\tprintable_markers\tfield_tags" ] for chunk in descriptor_chunks: if chunk.primary_label not in labels: continue raw_data = pathlib.Path(chunk.raw_path).read_bytes() compare_lines.append( "{index}\t{label}\t{role}\t0x{data_offset:X}\t0x{declared_size:X}\t{header_u16}\t{header_u32}\t{markers}\t{field_tags}".format( index=chunk.index, label=chunk.primary_label, role=chunk_role(chunk), data_offset=chunk.data_offset, declared_size=chunk.declared_size, header_u16=",".join(header_u16_words(raw_data)), header_u32=",".join(header_u32_words(raw_data)), markers="|".join(interesting_printable_markers(raw_data)), field_tags=",".join(chunk.field_tags), ) ) (out_dir / output_name).write_text("\n".join(compare_lines) + "\n", encoding="utf-8") def write_event_family_reports( out_dir: pathlib.Path, descriptor_chunks: list[ExtractedChunk], chunk_by_index: dict[int, ExtractedChunk], total_chunks: int, ) -> None: family_lines = [ "entry_index\tlabel\tfamily\ttag_kind\trole\tfield_count\tfield_names\tfield_tags\tdata_offset\tdeclared_size\tlocal_event_neighbors" ] families: dict[str, list[ExtractedChunk]] = {} for chunk in descriptor_chunks: family = classify_event_family(chunk) if not family: continue families.setdefault(family, []).append(chunk) local_event_neighbors = 0 for neighbor_index in range(max(0, chunk.index - 5), min(total_chunks, chunk.index + 6)): if neighbor_index == chunk.index: continue neighbor = chunk_by_index[neighbor_index] if event_tag_kind(neighbor): local_event_neighbors += 1 family_lines.append( "{index}\t{label}\t{family}\t{tag_kind}\t{role}\t{field_count}\t{field_names}\t{field_tags}\t0x{data_offset:X}\t0x{declared_size:X}\t{local_event_neighbors}".format( index=chunk.index, label=chunk.primary_label or "", family=family, tag_kind=event_tag_kind(chunk), role=chunk_role(chunk), field_count=len(chunk.field_names), field_names=",".join(chunk.field_names), field_tags=",".join(chunk.field_tags), data_offset=chunk.data_offset, declared_size=chunk.declared_size, local_event_neighbors=local_event_neighbors, ) ) (out_dir / "event_family_index.tsv").write_text("\n".join(family_lines) + "\n", encoding="utf-8") summary_lines = ["# Event Family Summary", ""] family_order = [ "event-hub", "boot-event-core", "npc-trigger", "minimal-event-core", "environmental-event", "specialized-event", "callback-eventtrigger", ] for family in family_order: family_chunks = families.get(family, []) if not family_chunks: continue summary_lines.append(f"## {family}") summary_lines.append("") summary_lines.append("| Index | Label | Tag Kind | Fields | Size | Local Event Neighbors |") summary_lines.append("|---:|---|---|---|---:|---:|") for chunk in sorted(family_chunks, key=lambda value: value.index): local_event_neighbors = 0 for neighbor_index in range(max(0, chunk.index - 5), min(total_chunks, chunk.index + 6)): if neighbor_index == chunk.index: continue neighbor = chunk_by_index[neighbor_index] if event_tag_kind(neighbor): local_event_neighbors += 1 summary_lines.append( "| {index} | {label} | {tag_kind} | {fields} | 0x{declared_size:X} | {local_event_neighbors} |".format( index=chunk.index, label=chunk.primary_label or "", tag_kind=event_tag_kind(chunk), fields=",".join(chunk.field_names) or "-", declared_size=chunk.declared_size, local_event_neighbors=local_event_neighbors, ) ) summary_lines.append("") (out_dir / "event_family_summary.md").write_text("\n".join(summary_lines), encoding="utf-8") def looks_text_like(data: bytes) -> bool: if not data: return False ratio = printable_ratio(data) if ratio < 0.80: return False if b"\r\n" in data or b"\n" in data: return True return zero_ratio(data) < 0.05 def parse_flx_table(data: bytes, table_offset: int = 0x80, count_offset: int = 0x54) -> FlxTable: file_size = len(data) entry_count = read_u32_le(data, count_offset) table_end = table_offset + entry_count * 8 if table_end > file_size: raise ValueError( f"FLX table extends past EOF: entry_count={entry_count} table_end=0x{table_end:X} file_size=0x{file_size:X}" ) entries: list[CandidateEntry] = [] for index in range(entry_count): offset = table_offset + index * 8 data_offset = read_u32_le(data, offset) declared_size = read_u32_le(data, offset + 4) if data_offset == 0 and declared_size == 0: continue if data_offset <= 0 or data_offset > file_size: continue if declared_size <= 0: continue entries.append(CandidateEntry(offset, data_offset, declared_size)) return FlxTable( entry_count=entry_count, table_offset=table_offset, table_end=table_end, entries=entries, ) def dump_chunk( base_dir: pathlib.Path, chunk_name: str, data: bytes ) -> tuple[str, str, str | None, bool, float, float, str, str | None, list[str], list[str]]: raw_path = base_dir / f"{chunk_name}.bin" strings_path = base_dir / f"{chunk_name}.strings.txt" text_path = base_dir / f"{chunk_name}.txt" raw_path.write_bytes(data) runs = iter_printable_runs(data) strings_path.write_text("\n".join(runs) + ("\n" if runs else ""), encoding="utf-8") primary_label, field_names = summarize_descriptor(runs) field_tags = extract_field_tag_records(data, field_names) text_like = looks_text_like(data) actual_text_path: str | None = None if text_like: text_path.write_text(data.decode("latin-1", errors="replace"), encoding="utf-8") actual_text_path = str(text_path) return ( str(raw_path), str(strings_path), actual_text_path, text_like, printable_ratio(data), zero_ratio(data), ascii_preview(data), primary_label, field_names, field_tags, ) def extract_candidates(data: bytes, out_dir: pathlib.Path, entries: list[CandidateEntry]) -> list[ExtractedChunk]: chunks_dir = out_dir / "chunks" chunks_dir.mkdir(parents=True, exist_ok=True) extracted: list[ExtractedChunk] = [] file_size = len(data) sorted_entries = sorted(enumerate(entries), key=lambda item: (item[1].data_offset, item[0])) next_by_original_index: dict[int, int | None] = {} for position, (original_index, entry) in enumerate(sorted_entries): next_offset = sorted_entries[position + 1][1].data_offset if position + 1 < len(sorted_entries) else None next_by_original_index[original_index] = next_offset for index, entry in enumerate(entries): next_offset = next_by_original_index.get(index) chunk_end = min(file_size, entry.data_offset + entry.declared_size) chunk_data = data[entry.data_offset:chunk_end] overlap = next_offset is not None and (entry.data_offset + entry.declared_size) > next_offset chunk_name = ( f"chunk_{index:03d}_table_{entry.table_offset:04X}_off_{entry.data_offset:06X}_len_{entry.declared_size:06X}" ) raw_path, strings_path, text_path, text_like, print_ratio, z_ratio, preview, primary_label, field_names, field_tags = dump_chunk( chunks_dir, chunk_name, chunk_data ) extracted.append( ExtractedChunk( index=index, table_offset=entry.table_offset, object_index=object_index_from_table_offset(entry.table_offset), data_offset=entry.data_offset, declared_size=entry.declared_size, next_offset=next_offset, extracted_size=len(chunk_data), overlap_with_next=overlap, text_like=text_like, printable_ratio=round(print_ratio, 4), zero_ratio=round(z_ratio, 4), preview=preview, raw_path=raw_path, strings_path=strings_path, text_path=text_path, primary_label=primary_label, field_names=field_names, field_tags=field_tags, ) ) annotate_class_layout(extracted) return extracted def write_summary(out_dir: pathlib.Path, input_path: pathlib.Path, data: bytes, entries: list[CandidateEntry], chunks: list[ExtractedChunk]) -> None: summary = { "input_path": str(input_path), "file_size": len(data), "header_preview_hex": data[:128].hex(), "header_preview_ascii": ascii_preview(data[:128], 128), "candidate_entries": [asdict(entry) for entry in entries], "chunks": [asdict(chunk) for chunk in chunks], } (out_dir / "summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") index_lines = [ "entry_index\ttable_offset\tobject_index\tclass_id\tclass_name_hint\traw_code_base_u32\tcode_base_minus_one\tconservative_event_count\tevent_table_end\tclass_parse_status\tdata_offset\tdeclared_size\textracted_size\ttext_like\tprintable_ratio\tzero_ratio\toverlap_with_next\tprimary_label\tfield_names\tfield_tags\tpreview" ] for chunk in chunks: index_lines.append( "{index}\t{table_offset}\t{object_index}\t{class_id}\t{class_name_hint}\t{raw_code_base_u32}\t{code_base_minus_one}\t{conservative_event_count}\t{event_table_end}\t{class_parse_status}\t{data_offset}\t{declared_size}\t{extracted_size}\t{text_like}\t{printable_ratio:.4f}\t{zero_ratio:.4f}\t{overlap}\t{primary_label}\t{field_names}\t{field_tags}\t{preview}".format( index=chunk.index, table_offset=(f"0x{chunk.table_offset:X}"), object_index=(f"0x{chunk.object_index:X}" if chunk.object_index is not None else ""), class_id=(f"0x{chunk.class_id:X}" if chunk.class_id is not None else ""), class_name_hint=chunk.class_name_hint or "", raw_code_base_u32=(f"0x{chunk.raw_code_base_u32:X}" if chunk.raw_code_base_u32 is not None else ""), code_base_minus_one=(f"0x{chunk.code_base_minus_one:X}" if chunk.code_base_minus_one is not None else ""), conservative_event_count=(chunk.conservative_event_count if chunk.conservative_event_count is not None else ""), event_table_end=(f"0x{chunk.event_table_end:X}" if chunk.event_table_end is not None else ""), class_parse_status=chunk.class_parse_status or "", data_offset=f"0x{chunk.data_offset:X}", declared_size=f"0x{chunk.declared_size:X}", extracted_size=f"0x{chunk.extracted_size:X}", text_like=int(chunk.text_like), printable_ratio=chunk.printable_ratio, zero_ratio=chunk.zero_ratio, overlap=int(chunk.overlap_with_next), primary_label=chunk.primary_label or "", field_names=",".join(chunk.field_names), field_tags=",".join(chunk.field_tags), preview=chunk.preview.replace("\t", " "), ) ) (out_dir / "entry_index.tsv").write_text("\n".join(index_lines) + "\n", encoding="utf-8") descriptor_lines = [ "entry_index\tobject_index\tclass_id\tclass_name_hint\traw_code_base_u32\tcode_base_minus_one\tconservative_event_count\tevent_table_end\tclass_parse_status\tprimary_label\tfield_names\tfield_tags\tdata_offset\tdeclared_size" ] descriptor_chunks = [chunk for chunk in chunks if chunk.primary_label or chunk.field_names] for chunk in descriptor_chunks: descriptor_lines.append( "{index}\t{object_index}\t{class_id}\t{class_name_hint}\t{raw_code_base_u32}\t{code_base_minus_one}\t{conservative_event_count}\t{event_table_end}\t{class_parse_status}\t{primary_label}\t{field_names}\t{field_tags}\t0x{data_offset:X}\t0x{declared_size:X}".format( index=chunk.index, object_index=(f"0x{chunk.object_index:X}" if chunk.object_index is not None else ""), class_id=(f"0x{chunk.class_id:X}" if chunk.class_id is not None else ""), class_name_hint=chunk.class_name_hint or "", raw_code_base_u32=(f"0x{chunk.raw_code_base_u32:X}" if chunk.raw_code_base_u32 is not None else ""), code_base_minus_one=(f"0x{chunk.code_base_minus_one:X}" if chunk.code_base_minus_one is not None else ""), conservative_event_count=(chunk.conservative_event_count if chunk.conservative_event_count is not None else ""), event_table_end=(f"0x{chunk.event_table_end:X}" if chunk.event_table_end is not None else ""), class_parse_status=chunk.class_parse_status or "", primary_label=chunk.primary_label or "", field_names=",".join(chunk.field_names), field_tags=",".join(chunk.field_tags), data_offset=chunk.data_offset, declared_size=chunk.declared_size, ) ) (out_dir / "descriptor_index.tsv").write_text("\n".join(descriptor_lines) + "\n", encoding="utf-8") class_layout_lines = [ "entry_index\tobject_index\tclass_id\tclass_name_hint\traw_code_base_u32\tcode_base_minus_one\tconservative_event_count\tevent_table_end\tclass_parse_status\tdata_offset\tdeclared_size\tprimary_label" ] parsed_class_chunks = [chunk for chunk in chunks if chunk.class_parse_status == "parsed-class-layout"] class_event_rows, rows_by_entry, raw_data_by_entry = build_class_event_rows(parsed_class_chunks) repeated_status_by_row = build_repeated_template_status_map( parsed_class_chunks, rows_by_entry, raw_data_by_entry, ) repeated_family_regression_lines = validate_verified_repeated_family_regressions( parsed_class_chunks, rows_by_entry, repeated_status_by_row, ) for chunk in parsed_class_chunks: class_layout_lines.append( "{index}\t0x{object_index:X}\t0x{class_id:X}\t{class_name_hint}\t0x{raw_code_base_u32:X}\t0x{code_base_minus_one:X}\t{conservative_event_count}\t0x{event_table_end:X}\t{class_parse_status}\t0x{data_offset:X}\t0x{declared_size:X}\t{primary_label}".format( index=chunk.index, object_index=chunk.object_index, class_id=chunk.class_id, class_name_hint=chunk.class_name_hint or "", raw_code_base_u32=chunk.raw_code_base_u32, code_base_minus_one=chunk.code_base_minus_one, conservative_event_count=chunk.conservative_event_count, event_table_end=chunk.event_table_end, class_parse_status=chunk.class_parse_status, data_offset=chunk.data_offset, declared_size=chunk.declared_size, primary_label=chunk.primary_label or "", ) ) (out_dir / "class_layout_index.tsv").write_text("\n".join(class_layout_lines) + "\n", encoding="utf-8") class_event_lines = [ "entry_index\tobject_index\tclass_id\tclass_name_hint\tslot\tevent_name_hint\traw_event_entry_word\traw_code_offset\tderived_body_start\tderived_body_end\tderived_body_length\trepeated_template_status" ] for row in class_event_rows: class_event_lines.append( "{entry_index}\t0x{object_index:X}\t0x{class_id:X}\t{class_name_hint}\t0x{slot:02X}\t{event_name_hint}\t0x{raw_event_entry_word:04X}\t0x{raw_code_offset:08X}\t{derived_body_start}\t{derived_body_end}\t{derived_body_length}\t{repeated_template_status}".format( entry_index=row.entry_index, object_index=row.object_index, class_id=row.class_id, class_name_hint=row.class_name_hint, slot=row.slot, event_name_hint=row.event_name_hint or "", raw_event_entry_word=row.raw_event_entry_word, raw_code_offset=row.raw_code_offset, derived_body_start=format_optional_hex(row.derived_body_start, 4), derived_body_end=format_optional_hex(row.derived_body_end, 4), derived_body_length=(row.derived_body_length if row.derived_body_length is not None else ""), repeated_template_status=repeated_status_by_row.get((row.entry_index, row.slot), ""), ) ) (out_dir / "class_event_index.tsv").write_text("\n".join(class_event_lines) + "\n", encoding="utf-8") for family_artifact_spec in FAMILY_ARTIFACT_SPECS: write_family_decompile_artifact( out_dir, parsed_class_chunks, rows_by_entry, raw_data_by_entry, repeated_status_by_row, family_artifact_spec, ) (out_dir / "repeated_family_regressions.tsv").write_text( "\n".join(repeated_family_regression_lines) + "\n", encoding="utf-8", ) neighborhood_lines = [ "center_index\tneighbor_index\tprimary_label\tfield_names\tfield_tags" ] interesting = {"JELYHACK", "JELYH2", "NPCTRIG", "CRUZTRIG", "TRIGPAD", "SPECIAL", "EVENT", "SFXTRIG"} interesting_indices = [chunk.index for chunk in chunks if chunk.primary_label in interesting] seen_pairs: set[tuple[int, int]] = set() chunk_by_index = {chunk.index: chunk for chunk in chunks} for center_index in interesting_indices: for neighbor_index in range(max(0, center_index - 4), min(len(chunks), center_index + 5)): pair = (center_index, neighbor_index) if pair in seen_pairs: continue seen_pairs.add(pair) chunk = chunk_by_index[neighbor_index] neighborhood_lines.append( "{center_index}\t{neighbor_index}\t{primary_label}\t{field_names}\t{field_tags}".format( center_index=center_index, neighbor_index=neighbor_index, primary_label=chunk.primary_label or "", field_names=",".join(chunk.field_names), field_tags=",".join(chunk.field_tags), ) ) (out_dir / "descriptor_neighborhoods.tsv").write_text("\n".join(neighborhood_lines) + "\n", encoding="utf-8") anchor_graph_lines = [ "anchor_index\tanchor_label\tanchor_fields\tneighbor_index\tdistance\tneighbor_label\tneighbor_fields\tneighbor_role\tevent_evidence" ] for anchor in descriptor_chunks: if not anchor.primary_label or not has_referent_field(anchor): continue for neighbor_index in range(max(0, anchor.index - 8), min(len(chunks), anchor.index + 9)): if neighbor_index == anchor.index: continue neighbor = chunk_by_index[neighbor_index] event_evidence = get_event_evidence(neighbor) if not event_evidence: continue anchor_graph_lines.append( "{anchor_index}\t{anchor_label}\t{anchor_fields}\t{neighbor_index}\t{distance:+d}\t{neighbor_label}\t{neighbor_fields}\t{neighbor_role}\t{event_evidence}".format( anchor_index=anchor.index, anchor_label=anchor.primary_label, anchor_fields=",".join(anchor.field_names), neighbor_index=neighbor.index, distance=neighbor.index - anchor.index, neighbor_label=neighbor.primary_label or "", neighbor_fields=",".join(neighbor.field_names), neighbor_role=chunk_role(neighbor), event_evidence=",".join(event_evidence), ) ) (out_dir / "referent_anchor_event_graph.tsv").write_text("\n".join(anchor_graph_lines) + "\n", encoding="utf-8") write_island_graph( out_dir, "jelyhack_island_graph.md", "JELYHACK Island Graph", {"JELYHACK", "JELYH2"}, descriptor_chunks, chunk_by_index, len(chunks), window=8, ) write_descriptor_compare( out_dir, "jelyhack_descriptor_compare.tsv", {"JELYHACK", "JELYH2", "REE_BOOT", "SURCAMEW", "SFXTRIG"}, descriptor_chunks, ) write_island_graph( out_dir, "event_island_graph.md", "EVENT Cluster Graph", {"EVENT", "COR_BOOT", "NPCTRIG", "ROLL_NS", "CRUZTRIG"}, descriptor_chunks, chunk_by_index, len(chunks), ) write_descriptor_compare( out_dir, "event_descriptor_compare.tsv", {"ROLL_NS", "COR_BOOT", "EVENT", "NPCTRIG", "CRUZTRIG", "NPC_ONLY", "VMAIL"}, descriptor_chunks, ) write_island_graph( out_dir, "boot_frontier_graph.md", "AND/BRO Boot Frontier Graph", {"AND_BOOT", "BRO_BOOT"}, descriptor_chunks, chunk_by_index, len(chunks), window=6, ) write_descriptor_compare( out_dir, "boot_family_compare.tsv", {"AND_BOOT", "BRO_BOOT", "COR_BOOT", "VAR_BOOT", "REE_BOOT"}, descriptor_chunks, ) write_island_graph( out_dir, "environmental_event_graph.md", "Environmental Event Graph", {"FLAMEBOX", "NOSTRIL", "STEAMBOX"}, descriptor_chunks, chunk_by_index, len(chunks), window=5, ) write_descriptor_compare( out_dir, "environmental_family_compare.tsv", {"FLAMEBOX", "NOSTRIL", "STEAMBOX"}, descriptor_chunks, ) write_descriptor_compare( out_dir, "callback_trigger_compare.tsv", {"SURCAMNS", "SURCAMEW"}, descriptor_chunks, ) write_event_family_reports(out_dir, descriptor_chunks, chunk_by_index, len(chunks)) write_readable_template_reports(out_dir, descriptor_chunks, chunk_by_index, len(chunks)) write_runtime_bridge_reports(out_dir, descriptor_chunks, chunk_by_index, len(chunks)) write_runtime_family_bridge_reports(out_dir, descriptor_chunks) lines = [] lines.append("# EUSECODE.FLX First-Pass Extraction") lines.append("") lines.append(f"Input: {input_path}") lines.append(f"File size: 0x{len(data):X} ({len(data)} bytes)") lines.append(f"Candidate entries: {len(entries)}") lines.append("") lines.append("## Header Preview") lines.append("") lines.append(f"ASCII: `{ascii_preview(data[:128], 128)}`") lines.append("") lines.append("## Chunks") lines.append("") lines.append("| # | Table Off | Data Off | Declared Size | Next Off | Text | Overlap | Preview |") lines.append("|---:|---:|---:|---:|---:|:---:|:---:|---|") for chunk in chunks: next_off = f"0x{chunk.next_offset:X}" if chunk.next_offset is not None else "-" lines.append( "| {index} | 0x{table_offset:X} | 0x{data_offset:X} | 0x{declared_size:X} | {next_off} | {text_like} | {overlap} | {preview} |".format( index=chunk.index, table_offset=chunk.table_offset, data_offset=chunk.data_offset, declared_size=chunk.declared_size, next_off=next_off, text_like="yes" if chunk.text_like else "no", overlap="yes" if chunk.overlap_with_next else "no", preview=chunk.preview.replace("|", "/"), ) ) lines.append("") lines.append("## Notes") lines.append("") lines.append("- The extractor now parses the validated FLX table directly: entry count at `0x54`, table at `0x80`, 8 bytes per entry.") lines.append("- Overlapping declared sizes likely mean some entries are counts or record spans rather than exact chunk lengths.") lines.append("- `.strings.txt` files are the main human-readable output for now; `.txt` files are emitted only for chunks that look text-like.") lines.append("- `descriptor_index.tsv` summarizes guessed class labels, field names, and compact tag patterns for descriptor-like chunks.") lines.append("- `class_layout_index.tsv` records the conservative owner-loaded class parsing state: object index, class id, class-name hint, raw bytes-8..11 field, derived code-base-minus-one, and event-count/table-end values when the local divisibility and bounds checks succeed.") lines.append("- `class_event_index.tsv` now also emits derived body-window columns (`derived_body_start`, `derived_body_end`, `derived_body_length`) plus conservative `repeated_template_status` tags for verified repeated families.") lines.append("- `boot_family_decompile.md` / `.tsv`, `callback_family_decompile.md` / `.tsv`, and `environmental_family_decompile.md` / `.tsv` now provide reversible per-class decompile artifacts for the `_BOOT`, `SURCAM*`, and environmental repeated-family lanes.") lines.append("- `repeated_family_regressions.tsv` enforces the current repeated-family slot sets plus the verified raw-row and derived body-window fields for `JELYHACK/JELYH2`, `_BOOT`, `SURCAM*`, and `FLAMEBOX/NOSTRIL/STEAMBOX`.") lines.append("- `descriptor_neighborhoods.tsv` captures local table neighborhoods around trigger/event-related classes such as `JELYHACK`, `NPCTRIG`, `CRUZTRIG`, `TRIGPAD`, and `SPECIAL`.") lines.append("- `referent_anchor_event_graph.tsv` groups referent-bearing descriptors with nearby event-bearing neighbors so the attachment model can be inspected without ad hoc grepping.") lines.append("- `jelyhack_island_graph.md` now uses a wider local window so the `JELYHACK` / `JELYH2` anchors can be inspected alongside the nearby event-bearing `REE_BOOT`, `SURCAMEW`, and `SFXTRIG` descriptors rather than stopping at the referent-only neighbors.") lines.append("- `jelyhack_descriptor_compare.tsv` captures the first 16 header words, first 8 dwords, and a few odd printable markers for the core JELYHACK-island descriptors so structural similarity can be compared without raw hex dumps.") lines.append("- `event_island_graph.md` renders the denser `EVENT` / `COR_BOOT` / `NPCTRIG` / `ROLL_NS` / `CRUZTRIG` island, which currently looks like the strongest event-explicit neighborhood outside the JELYHACK anchor case.") lines.append("- `event_descriptor_compare.tsv` captures the same header-word and printable-marker comparison for the `EVENT` island so large event-bearing descriptors can be contrasted with neighboring trigger and referent records.") lines.append("- `boot_frontier_graph.md` renders the upstream referent neighborhood feeding `AND_BOOT` / `BRO_BOOT`, which is currently the clearest unexplored boot-event frontier.") lines.append("- `boot_family_compare.tsv` compares the five `_BOOT` event cores (`AND_BOOT`, `BRO_BOOT`, `COR_BOOT`, `VAR_BOOT`, `REE_BOOT`) by header words, markers, and field tags.") lines.append("- `environmental_event_graph.md` renders the three hazard/event islands centered on `FLAMEBOX`, `NOSTRIL`, and `STEAMBOX`, each surrounded by its own referent-heavy local neighborhood.") lines.append("- `environmental_family_compare.tsv` compares the environmental event trio so the shared hazard pattern (`referent,event,,,direction,count`) can be contrasted directly.") lines.append("- `callback_trigger_compare.tsv` compares `SURCAMNS` and `SURCAMEW` directly so the callback-only `eventTrigger` lane can be checked against the active `event` families without raw hex dumps.") lines.append("- `event_family_index.tsv` and `event_family_summary.md` classify all current `event` and `eventTrigger` descriptors into reusable families such as boot-event cores, minimal event cores, environmental events, and callback-only surveillance triggers.") lines.append("- `readable_descriptor_templates.md` and `readable_descriptor_templates.tsv` emit conservative pseudo-script sketches for the strongest current anchor, event-hub, environmental, and callback lanes so USECODE neighborhoods can be read as structured attachments instead of only raw descriptor rows.") lines.append("- `runtime_vm_ir.tsv` captures the currently verified 000d VM operator vocabulary as machine-readable rows with stage addresses, opcode/lane status, payload shape, and evidence anchors.") lines.append("- `vm_mask_ladder.tsv` records the current `entity_vm_context_try_create_masked_for_entity` wrapper ladder in machine-readable form so gameplay mask lanes can be compared against descriptor-side families without reopening the notes.") lines.append("- `readable_script_ir.md` and `readable_script_ir.tsv` join descriptor neighborhoods, the verified VM IR, the runtime owner/source path, and the current mask-family hints into one conservative script-facing bridge artifact.") lines.append("- `runtime_descriptor_family_rankings.md` and `runtime_descriptor_family_rankings.tsv` rank descriptor families against the verified runtime lanes so the current human-readable script bridge is searchable by family fit rather than only by neighborhood dumps.") (out_dir / "README.md").write_text("\n".join(lines) + "\n", encoding="utf-8") all_strings = iter_printable_runs(data) (out_dir / "all_strings.txt").write_text("\n".join(all_strings) + ("\n" if all_strings else ""), encoding="utf-8") def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("input", nargs="?", type=pathlib.Path, default=DEFAULT_INPUT) parser.add_argument("output", nargs="?", type=pathlib.Path, default=DEFAULT_OUTPUT) return parser.parse_args() def main() -> int: args = parse_args() data = args.input.read_bytes() args.output.mkdir(parents=True, exist_ok=True) flx_table = parse_flx_table(data) entries = flx_table.entries chunks = extract_candidates(data, args.output, entries) write_summary(args.output, args.input, data, entries, chunks) print( f"Parsed {flx_table.entry_count} table slots with {len(chunks)} non-zero entries; extracted to {args.output}" ) return 0 if __name__ == "__main__": raise SystemExit(main())