Add extractor for Crusader's EUSECODE.FLX container
- Implemented a Python script to extract data from the EUSECODE.FLX file format.
- Defined data structures for candidate entries and extracted chunks using dataclasses.
- Added functions to read and parse the FLX table, extract candidate data, and generate human-readable output files.
- Included functionality for analyzing extracted data, including generating summaries, descriptors, and event family reports.
- Implemented utilities for calculating printable ratios, zero ratios, and identifying text-like data.
- Added support for writing various output formats, including JSON, TSV, and Markdown.
This commit is contained in:
parent
3d4c4933ec
commit
3daffbf113
58 changed files with 30295 additions and 2504 deletions
788
tools/extract_eusecode_flx.py
Normal file
788
tools/extract_eusecode_flx.py
Normal file
|
|
@ -0,0 +1,788 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Extractor for Crusader's EUSECODE.FLX container.
|
||||
|
||||
Current validated layout:
|
||||
- 0x80-byte header area
|
||||
- little-endian entry count at file offset 0x54
|
||||
- entry table begins at 0x80
|
||||
- each entry is 8 bytes: <u32 data_offset, u32 declared_size>
|
||||
|
||||
The exact semantics of the payload records are still under RE, so the extractor dumps
|
||||
all non-zero entries and emits human-readable sidecars (.strings.txt and index files)
|
||||
to support the next decoding pass.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import pathlib
|
||||
import struct
|
||||
from dataclasses import asdict, dataclass
|
||||
|
||||
|
||||
# Default FLX container to read when no `input` argument is given on the command line.
# NOTE(review): this is an absolute Windows path on a specific drive; pass an explicit
# path on any other machine.
DEFAULT_INPUT = pathlib.Path(r"k:\ghidra\Crusader_Decomp\USECODE\EUSECODE.FLX")
# Default directory that receives the extracted chunks and report files when no
# `output` argument is given.
DEFAULT_OUTPUT = pathlib.Path(r"k:\ghidra\Crusader_Decomp\USECODE\EUSECODE_extracted")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CandidateEntry:
    """One usable slot read from the FLX entry table."""

    # File offset of the 8-byte table slot this entry was read from.
    table_offset: int
    # First u32 of the slot: file offset at which the payload starts.
    data_offset: int
    # Second u32 of the slot: payload size as declared by the table.
    declared_size: int
|
||||
|
||||
|
||||
@dataclass
class ExtractedChunk:
    """Metadata recorded for one payload dumped from the container."""

    # Position of the entry in the filtered entry list (not the raw table slot number).
    index: int
    # File offset of the table slot that describes this chunk.
    table_offset: int
    # File offset at which the payload starts.
    data_offset: int
    # Size declared by the table; may exceed what was actually extracted.
    declared_size: int
    # Data offset of the next entry in offset order, or None for the last one.
    next_offset: int | None
    # Number of bytes actually dumped (declared span clipped to the end of the file).
    extracted_size: int
    # True when data_offset + declared_size runs past next_offset.
    overlap_with_next: bool
    # Result of the text-likeness heuristic for the payload.
    text_like: bool
    # Share of printable bytes in the payload, rounded to four decimals.
    printable_ratio: float
    # Share of zero bytes in the payload, rounded to four decimals.
    zero_ratio: float
    # ASCII rendering of the leading payload bytes, non-printables shown as ".".
    preview: str
    # Path of the raw `.bin` dump.
    raw_path: str
    # Path of the `.strings.txt` sidecar listing printable runs.
    strings_path: str
    # Path of the `.txt` dump, or None when the payload did not look text-like.
    text_path: str | None
    # Most frequent upper-case string found in the payload, or None if there is none.
    primary_label: str | None
    # Identifier-like strings starting with a lower-case letter, in first-seen order.
    field_names: list[str]
    # Sorted "XX:YYZZ->name" records built from the three bytes preceding a field name.
    field_tags: list[str]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FlxTable:
    """Parsed view of the FLX entry table."""

    # Slot count read from the header; includes slots that were filtered out.
    entry_count: int
    # File offset of the first table slot.
    table_offset: int
    # File offset just past the last table slot (table_offset + entry_count * 8).
    table_end: int
    # Slots that survived filtering, in table order.
    entries: list[CandidateEntry]
|
||||
|
||||
|
||||
def read_u32_le(data: bytes, offset: int) -> int:
    """Return the unsigned little-endian 32-bit integer stored at *offset* in *data*."""
    (value,) = struct.unpack_from("<I", data, offset)
    return value
|
||||
|
||||
|
||||
def read_u16_le(data: bytes, offset: int) -> int:
    """Return the unsigned little-endian 16-bit integer stored at *offset* in *data*."""
    (value,) = struct.unpack_from("<H", data, offset)
    return value
|
||||
|
||||
|
||||
def ascii_preview(data: bytes, limit: int = 64) -> str:
    """Render the first *limit* bytes of *data* as ASCII.

    Bytes in the range 0x20..0x7E are shown as their character; every other
    byte (including tab and newline) is shown as ".".
    """
    return "".join(chr(value) if 0x20 <= value <= 0x7E else "." for value in data[:limit])
|
||||
|
||||
|
||||
def printable_ratio(data: bytes) -> float:
    """Return the fraction of bytes in *data* that are printable.

    Printable means tab, line feed, carriage return, or 0x20..0x7E.
    Empty input yields 0.0.
    """
    total = len(data)
    if total == 0:
        return 0.0
    hits = 0
    for value in data:
        if value in (0x09, 0x0A, 0x0D) or 0x20 <= value <= 0x7E:
            hits += 1
    return hits / total
|
||||
|
||||
|
||||
def zero_ratio(data: bytes) -> float:
    """Return the fraction of bytes in *data* that are 0x00 (0.0 for empty input)."""
    return data.count(0) / len(data) if data else 0.0
|
||||
|
||||
|
||||
def iter_printable_runs(data: bytes, min_len: int = 4) -> list[str]:
    """Collect maximal runs of printable bytes that are at least *min_len* long.

    Printable means tab, line feed, carriage return, or 0x20..0x7E, so a run
    may itself contain line breaks. Runs are returned in file order, decoded
    as latin-1.
    """
    found: list[str] = []
    pending = bytearray()

    def flush() -> None:
        # Keep the accumulated run only if it is long enough, then start over.
        if len(pending) >= min_len:
            found.append(pending.decode("latin-1"))
        pending.clear()

    for value in data:
        if value in (0x09, 0x0A, 0x0D) or 0x20 <= value <= 0x7E:
            pending.append(value)
        else:
            flush()
    flush()
    return found
|
||||
|
||||
|
||||
def summarize_descriptor(strings: list[str]) -> tuple[str | None, list[str]]:
    """Guess a label and the field names from a chunk's printable strings.

    Upper-case strings are counted as label candidates; the most frequent one
    wins, ties broken alphabetically. Strings that start with a lower-case
    letter and are alphanumeric apart from underscores are collected as field
    names, de-duplicated in first-seen order.

    Returns ``(primary_label, field_names)``; the label is None when no
    upper-case string was seen.
    """
    tallies: dict[str, int] = {}
    ordered_fields: list[str] = []
    known: set[str] = set()

    for text in strings:
        if text.isupper() and any(ch.isalpha() for ch in text):
            tallies[text] = tallies.get(text, 0) + 1
        elif text and text[0].islower() and text.replace("_", "").isalnum() and text not in known:
            known.add(text)
            ordered_fields.append(text)

    if not tallies:
        return None, ordered_fields
    # Highest count first, then alphabetical order among equal counts.
    winner = min(tallies.items(), key=lambda pair: (-pair[1], pair[0]))
    return winner[0], ordered_fields
|
||||
|
||||
|
||||
def extract_field_tag_records(data: bytes, field_names: list[str]) -> list[str]:
    """Build tag records from the three bytes that precede each field name.

    For every occurrence of a field name in *data* that has at least three
    bytes in front of it, a record ``"XX:YYZZ->name"`` is produced, where XX,
    YY and ZZ are those three bytes in hex. Duplicate records are dropped.

    An occurrence within the first three bytes cannot yield a record and is
    skipped; scanning continues so later occurrences are still reported.

    Returns the distinct records sorted lexicographically.
    """
    tags: list[str] = []
    seen: set[str] = set()

    for field_name in field_names:
        needle = field_name.encode("latin-1")
        if not needle:
            # An empty needle matches at every offset and carries no information.
            continue
        pos = data.find(needle)
        while pos != -1:
            if pos >= 3:
                tag = f"{data[pos - 3]:02X}:{data[pos - 2]:02X}{data[pos - 1]:02X}->{field_name}"
                if tag not in seen:
                    seen.add(tag)
                    tags.append(tag)
            # Advance by one byte so overlapping occurrences are found too.
            pos = data.find(needle, pos + 1)

    tags.sort()
    return tags
|
||||
|
||||
|
||||
def has_referent_field(chunk: ExtractedChunk) -> bool:
    """Return True if *chunk* lists a ``referent`` field name or a tag ending in ``->referent``."""
    return "referent" in chunk.field_names or any(
        tag.endswith("->referent") for tag in chunk.field_tags
    )
|
||||
|
||||
|
||||
def get_event_evidence(chunk: ExtractedChunk) -> list[str]:
    """List the markers that make *chunk* look event-related.

    Field names containing "event" (case-insensitive) become ``field:<name>``
    markers, and field tags containing "->event" become ``tag:<tag>`` markers.
    Field markers come first; duplicates are removed while keeping order.
    """
    field_markers = [f"field:{name}" for name in chunk.field_names if "event" in name.lower()]
    tag_markers = [f"tag:{tag}" for tag in chunk.field_tags if "->event" in tag.lower()]
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(field_markers + tag_markers))
|
||||
|
||||
|
||||
def chunk_role(chunk: ExtractedChunk) -> str:
    """Classify *chunk* for the graph reports.

    The first matching rule wins: ``referent-anchor`` for the JELYHACK/JELYH2
    labels, ``event-bearing`` when any event evidence exists,
    ``referent-neighbor`` when a referent field is present, else ``neighbor``.
    """
    if chunk.primary_label in {"JELYHACK", "JELYH2"}:
        role = "referent-anchor"
    elif get_event_evidence(chunk):
        role = "event-bearing"
    elif has_referent_field(chunk):
        role = "referent-neighbor"
    else:
        role = "neighbor"
    return role
|
||||
|
||||
|
||||
def has_event_trigger_field(chunk: ExtractedChunk) -> bool:
    """Return True if *chunk* has an ``eventTrigger`` field name or tag (case-insensitive)."""
    for name in chunk.field_names:
        if name.lower() == "eventtrigger":
            return True
    for tag in chunk.field_tags:
        if "->eventtrigger" in tag.lower():
            return True
    return False
|
||||
|
||||
|
||||
def event_tag_kind(chunk: ExtractedChunk) -> str:
    """Return which kind of event tag *chunk* carries.

    ``"eventTrigger"`` takes precedence when any tag contains
    "->eventtrigger"; otherwise ``"event"`` when a tag ends in "->event";
    otherwise the empty string. Matching is case-insensitive.
    """
    lowered = [tag.lower() for tag in chunk.field_tags]
    if any("->eventtrigger" in tag for tag in lowered):
        return "eventTrigger"
    if any(tag.endswith("->event") for tag in lowered):
        return "event"
    return ""
|
||||
|
||||
|
||||
def classify_event_family(chunk: ExtractedChunk) -> str:
    """Assign *chunk* to an event family, or return "" if it has no event tag.

    Chunks tagged ``eventTrigger`` are always ``callback-eventtrigger``.
    Chunks tagged ``event`` are refined by label and field names, with the
    first matching rule winning; anything left over is ``specialized-event``.
    """
    kind = event_tag_kind(chunk)
    if kind == "eventTrigger":
        return "callback-eventtrigger"
    if kind != "event":
        return ""

    label = chunk.primary_label
    names = chunk.field_names
    if label == "EVENT":
        return "event-hub"
    if label and label.endswith("_BOOT"):
        return "boot-event-core"
    if names == ["referent", "event"]:
        return "minimal-event-core"
    hazard_fields = ("flame", "flame2", "fire", "fire2", "steam", "steam2")
    if any(hazard in names for hazard in hazard_fields):
        return "environmental-event"
    if "typeNpc" in names:
        return "npc-trigger"
    return "specialized-event"
|
||||
|
||||
|
||||
def header_u16_words(data: bytes, count: int = 16) -> list[str]:
    """Format the leading little-endian 16-bit words of *data* as ``0xNNNN`` strings.

    At most *count* words are returned, fewer when *data* is shorter; a
    trailing odd byte is ignored.
    """
    word_count = max(0, min(len(data) // 2, count))
    words = struct.unpack_from(f"<{word_count}H", data, 0)
    return [f"0x{word:04X}" for word in words]
|
||||
|
||||
|
||||
def header_u32_words(data: bytes, count: int = 8) -> list[str]:
    """Format the leading little-endian 32-bit words of *data* as ``0xNNNNNNNN`` strings.

    At most *count* words are returned, fewer when *data* is shorter; trailing
    bytes that do not fill a whole word are ignored.
    """
    word_count = max(0, min(len(data) // 4, count))
    words = struct.unpack_from(f"<{word_count}I", data, 0)
    return [f"0x{word:08X}" for word in words]
|
||||
|
||||
|
||||
def interesting_printable_markers(data: bytes) -> list[str]:
    """Return up to eight distinct printable runs that contain a known marker token.

    Runs of at least three printable bytes are scanned in file order; a run is
    kept when it contains any of the hard-coded tokens. Because runs may
    include tab, CR and LF, the returned strings can contain those characters.
    """
    tokens = ("wx[", "wt$[", "t$t=t@", "$Q", "?\n", "?\r")
    # A dict is used as an insertion-ordered set.
    unique: dict[str, None] = {}
    for run in iter_printable_runs(data, min_len=3):
        if run not in unique and any(token in run for token in tokens):
            unique[run] = None
    return list(unique)[:8]
|
||||
|
||||
|
||||
def write_island_graph(
    out_dir: pathlib.Path,
    output_name: str,
    title: str,
    center_labels: set[str],
    descriptor_chunks: list[ExtractedChunk],
    chunk_by_index: dict[int, ExtractedChunk],
    total_chunks: int,
    window: int = 5,
) -> None:
    """Write a Markdown node/edge report around the chunks labelled *center_labels*.

    Every descriptor chunk whose primary label is in *center_labels* is a
    centre. The node table lists all chunks within *window* table positions of
    any centre; the edge table links each centre to each of its neighbours,
    marking neighbours with event evidence as possible event attachments.

    Nothing is written when no centre is found. The report goes to
    ``out_dir / output_name`` as UTF-8.
    """
    hubs = [chunk for chunk in descriptor_chunks if chunk.primary_label in center_labels]
    if not hubs:
        return

    def neighborhood(hub: ExtractedChunk) -> range:
        # Table positions within `window` of the hub, clipped to the valid index range.
        return range(max(0, hub.index - window), min(total_chunks, hub.index + window + 1))

    node_ids: set[int] = set()
    for hub in hubs:
        node_ids.update(neighborhood(hub))

    report = [
        f"# {title}",
        "",
        "## Nodes",
        "",
        "| Index | Label | Role | Fields | Event Evidence |",
        "|---:|---|---|---|---|",
    ]
    for node_id in sorted(node_ids):
        node = chunk_by_index[node_id]
        label = node.primary_label or ""
        role = chunk_role(node)
        fields = ",".join(node.field_names) or "-"
        evidence = ",".join(get_event_evidence(node)) or "-"
        report.append(f"| {node_id} | {label} | {role} | {fields} | {evidence} |")

    report += ["", "## Edges", "", "| Source | Relation | Target | Evidence |", "|---|---|---|---|"]
    for hub in hubs:
        for other_id in neighborhood(hub):
            if other_id == hub.index:
                continue
            other = chunk_by_index[other_id]
            distance = other.index - hub.index
            found = get_event_evidence(other)
            kind = "possible-event-attachment" if found else "table-neighbor"
            relation = f"{kind}({distance:+d})"
            target = other.primary_label or ""
            detail = ",".join(found) or "same local extraction neighborhood"
            report.append(
                f"| {hub.primary_label} ({hub.index}) | {relation} | {target} ({other.index}) | {detail} |"
            )

    (out_dir / output_name).write_text("\n".join(report) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def write_descriptor_compare(
    out_dir: pathlib.Path,
    output_name: str,
    labels: set[str],
    descriptor_chunks: list[ExtractedChunk],
) -> None:
    """Write a TSV comparing the raw headers of the chunks labelled *labels*.

    For each descriptor chunk whose primary label is in *labels*, the raw dump
    at ``chunk.raw_path`` is re-read and one row is emitted with the leading
    16-bit and 32-bit header words, the interesting printable markers (joined
    with "|"), and the chunk's field tags.

    Markers are printable runs and may contain tab, CR or LF. Those characters
    and the backslash are written as the escapes ``\\t``, ``\\r``, ``\\n`` and
    ``\\\\`` so that every record stays on one line with a fixed column count.

    The file is written to ``out_dir / output_name`` as UTF-8.
    """
    # Backslash is escaped too, so an escaped control character cannot be
    # confused with a literal backslash sequence already present in the data.
    cell_escapes = str.maketrans({"\\": "\\\\", "\t": "\\t", "\n": "\\n", "\r": "\\r"})

    compare_lines = [
        "entry_index\tlabel\trole\tdata_offset\tdeclared_size\theader_u16\theader_u32\tprintable_markers\tfield_tags"
    ]
    for chunk in descriptor_chunks:
        if chunk.primary_label not in labels:
            continue
        raw_data = pathlib.Path(chunk.raw_path).read_bytes()
        markers = "|".join(
            marker.translate(cell_escapes) for marker in interesting_printable_markers(raw_data)
        )
        compare_lines.append(
            "{index}\t{label}\t{role}\t0x{data_offset:X}\t0x{declared_size:X}\t{header_u16}\t{header_u32}\t{markers}\t{field_tags}".format(
                index=chunk.index,
                label=chunk.primary_label,
                role=chunk_role(chunk),
                data_offset=chunk.data_offset,
                declared_size=chunk.declared_size,
                header_u16=",".join(header_u16_words(raw_data)),
                header_u32=",".join(header_u32_words(raw_data)),
                markers=markers,
                field_tags=",".join(chunk.field_tags),
            )
        )
    (out_dir / output_name).write_text("\n".join(compare_lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def write_event_family_reports(
    out_dir: pathlib.Path,
    descriptor_chunks: list[ExtractedChunk],
    chunk_by_index: dict[int, ExtractedChunk],
    total_chunks: int,
) -> None:
    """Write the event-family index (TSV) and summary (Markdown).

    Each descriptor chunk that classifies into an event family gets one row in
    ``event_family_index.tsv``. ``event_family_summary.md`` then lists the
    chunks per family, in a fixed family order and sorted by index. Both
    reports include the number of event-tagged chunks within five table
    positions of the chunk.
    """

    def count_event_neighbors(center: int) -> int:
        # Event-tagged chunks within five positions of `center`, excluding itself.
        nearby = range(max(0, center - 5), min(total_chunks, center + 6))
        return sum(
            1
            for position in nearby
            if position != center and event_tag_kind(chunk_by_index[position])
        )

    index_rows = [
        "entry_index\tlabel\tfamily\ttag_kind\trole\tfield_count\tfield_names\tfield_tags\tdata_offset\tdeclared_size\tlocal_event_neighbors"
    ]
    by_family: dict[str, list[ExtractedChunk]] = {}
    # The count depends only on the chunk index, so compute it once per chunk
    # and reuse it for the summary below.
    neighbor_counts: dict[int, int] = {}

    for chunk in descriptor_chunks:
        family = classify_event_family(chunk)
        if not family:
            continue
        by_family.setdefault(family, []).append(chunk)
        neighbor_counts[chunk.index] = count_event_neighbors(chunk.index)
        cells = [
            str(chunk.index),
            chunk.primary_label or "",
            family,
            event_tag_kind(chunk),
            chunk_role(chunk),
            str(len(chunk.field_names)),
            ",".join(chunk.field_names),
            ",".join(chunk.field_tags),
            f"0x{chunk.data_offset:X}",
            f"0x{chunk.declared_size:X}",
            str(neighbor_counts[chunk.index]),
        ]
        index_rows.append("\t".join(cells))
    (out_dir / "event_family_index.tsv").write_text("\n".join(index_rows) + "\n", encoding="utf-8")

    summary = ["# Event Family Summary", ""]
    family_order = (
        "event-hub",
        "boot-event-core",
        "npc-trigger",
        "minimal-event-core",
        "environmental-event",
        "specialized-event",
        "callback-eventtrigger",
    )
    for family in family_order:
        members = by_family.get(family, [])
        if not members:
            continue
        summary += [
            f"## {family}",
            "",
            "| Index | Label | Tag Kind | Fields | Size | Local Event Neighbors |",
            "|---:|---|---|---|---:|---:|",
        ]
        for chunk in sorted(members, key=lambda member: member.index):
            label = chunk.primary_label or ""
            tag_kind = event_tag_kind(chunk)
            fields = ",".join(chunk.field_names) or "-"
            summary.append(
                f"| {chunk.index} | {label} | {tag_kind} | {fields} | 0x{chunk.declared_size:X} | {neighbor_counts[chunk.index]} |"
            )
        summary.append("")
    (out_dir / "event_family_summary.md").write_text("\n".join(summary), encoding="utf-8")
|
||||
|
||||
|
||||
def looks_text_like(data: bytes) -> bool:
    """Heuristically decide whether *data* is plain text.

    Data qualifies when at least 80% of its bytes are printable and it either
    contains a line feed or has fewer than 5% zero bytes. Empty input is
    never text-like.
    """
    if not data or printable_ratio(data) < 0.80:
        return False
    # A CRLF pair always contains LF, so a single LF test covers both endings.
    return b"\n" in data or zero_ratio(data) < 0.05
|
||||
|
||||
|
||||
def parse_flx_table(data: bytes, table_offset: int = 0x80, count_offset: int = 0x54) -> FlxTable:
    """Parse the FLX entry table out of the raw container bytes.

    The slot count is a little-endian u32 at *count_offset*; the table starts
    at *table_offset* and each slot is 8 bytes: ``<u32 data_offset, u32
    declared_size>``.

    Slots are skipped when their data offset is zero, when it does not point
    inside the file, or when their declared size is zero. The returned
    ``entry_count`` is the raw slot count and therefore includes skipped slots.

    Raises:
        ValueError: if the file is too short to hold the count field, or if
            the table would extend past the end of the file.
    """
    file_size = len(data)
    if count_offset < 0 or count_offset + 4 > file_size:
        raise ValueError(
            f"FLX header truncated: count_offset=0x{count_offset:X} file_size=0x{file_size:X}"
        )
    entry_count = read_u32_le(data, count_offset)
    table_end = table_offset + entry_count * 8
    if table_end > file_size:
        raise ValueError(
            f"FLX table extends past EOF: entry_count={entry_count} table_end=0x{table_end:X} file_size=0x{file_size:X}"
        )

    entries: list[CandidateEntry] = []
    for index in range(entry_count):
        offset = table_offset + index * 8
        data_offset = read_u32_le(data, offset)
        declared_size = read_u32_le(data, offset + 4)
        # Both values are unsigned, so "empty" simply means zero. An offset
        # equal to file_size has no bytes behind it and is rejected as well.
        if data_offset == 0 or data_offset >= file_size:
            continue
        if declared_size == 0:
            continue
        entries.append(CandidateEntry(offset, data_offset, declared_size))

    return FlxTable(
        entry_count=entry_count,
        table_offset=table_offset,
        table_end=table_end,
        entries=entries,
    )
|
||||
|
||||
|
||||
def dump_chunk(
    base_dir: pathlib.Path, chunk_name: str, data: bytes
) -> tuple[str, str, str | None, bool, float, float, str, str | None, list[str], list[str]]:
    """Write one chunk and its sidecar files, and return its analysis results.

    Always writes ``<chunk_name>.bin`` (raw bytes) and
    ``<chunk_name>.strings.txt`` (printable runs, one per line) into
    *base_dir*. ``<chunk_name>.txt`` is written only when the data looks
    text-like, decoded as latin-1.

    Returns a tuple of: raw path, strings path, text path or None, text-like
    flag, printable ratio, zero ratio, ASCII preview, primary label or None,
    field names, field tags.
    """
    bin_file = base_dir / f"{chunk_name}.bin"
    bin_file.write_bytes(data)

    printable_runs = iter_printable_runs(data)
    strings_file = base_dir / f"{chunk_name}.strings.txt"
    listing = "\n".join(printable_runs)
    if printable_runs:
        listing += "\n"
    strings_file.write_text(listing, encoding="utf-8")

    label, names = summarize_descriptor(printable_runs)
    tags = extract_field_tag_records(data, names)

    is_text = looks_text_like(data)
    written_text_path: str | None = None
    if is_text:
        text_file = base_dir / f"{chunk_name}.txt"
        text_file.write_text(data.decode("latin-1", errors="replace"), encoding="utf-8")
        written_text_path = str(text_file)

    return (
        str(bin_file),
        str(strings_file),
        written_text_path,
        is_text,
        printable_ratio(data),
        zero_ratio(data),
        ascii_preview(data),
        label,
        names,
        tags,
    )
|
||||
|
||||
|
||||
def extract_candidates(data: bytes, out_dir: pathlib.Path, entries: list[CandidateEntry]) -> list[ExtractedChunk]:
    """Dump every table entry into ``out_dir / "chunks"`` and describe the results.

    Each entry is cut from *data* at its data offset for its declared size,
    clipped to the end of the file. For each entry the data offset of the next
    entry in offset order is recorded, and the entry is flagged as overlapping
    when its declared span runs past that offset.

    Returns one ExtractedChunk per entry, in table order.
    """
    chunks_dir = out_dir / "chunks"
    chunks_dir.mkdir(parents=True, exist_ok=True)
    file_size = len(data)

    # Order entries by where their payload starts (ties broken by table
    # position) and pair each one with the start of its successor.
    by_offset = sorted(enumerate(entries), key=lambda item: (item[1].data_offset, item[0]))
    successor_offsets: list[int | None] = [entry.data_offset for _, entry in by_offset[1:]]
    successor_offsets.append(None)
    following = {slot: upcoming for (slot, _), upcoming in zip(by_offset, successor_offsets)}

    results: list[ExtractedChunk] = []
    for index, entry in enumerate(entries):
        next_offset = following[index]
        declared_end = entry.data_offset + entry.declared_size
        payload = data[entry.data_offset:min(file_size, declared_end)]
        chunk_name = (
            f"chunk_{index:03d}_table_{entry.table_offset:04X}_off_{entry.data_offset:06X}_len_{entry.declared_size:06X}"
        )
        (
            raw_path,
            strings_path,
            text_path,
            text_like,
            print_ratio,
            z_ratio,
            preview,
            primary_label,
            field_names,
            field_tags,
        ) = dump_chunk(chunks_dir, chunk_name, payload)

        results.append(
            ExtractedChunk(
                index=index,
                table_offset=entry.table_offset,
                data_offset=entry.data_offset,
                declared_size=entry.declared_size,
                next_offset=next_offset,
                extracted_size=len(payload),
                overlap_with_next=next_offset is not None and declared_end > next_offset,
                text_like=text_like,
                printable_ratio=round(print_ratio, 4),
                zero_ratio=round(z_ratio, 4),
                preview=preview,
                raw_path=raw_path,
                strings_path=strings_path,
                text_path=text_path,
                primary_label=primary_label,
                field_names=field_names,
                field_tags=field_tags,
            )
        )

    return results
|
||||
|
||||
|
||||
def write_summary(out_dir: pathlib.Path, input_path: pathlib.Path, data: bytes, entries: list[CandidateEntry], chunks: list[ExtractedChunk]) -> None:
    """Write all index, graph and README files for one extraction run.

    Files are written into *out_dir* in this order: ``summary.json``,
    ``entry_index.tsv``, ``descriptor_index.tsv``,
    ``descriptor_neighborhoods.tsv``, ``referent_anchor_event_graph.tsv``, the
    focused island graphs and descriptor comparisons, the event-family
    reports, ``README.md`` and ``all_strings.txt``.

    *data* is the whole input file; it supplies the header preview and the
    global string listing. *entries* and *chunks* are the parsed table entries
    and their extracted counterparts.
    """
    header_ascii = ascii_preview(data[:128], 128)

    # --- summary.json -------------------------------------------------------
    summary = {
        "input_path": str(input_path),
        "file_size": len(data),
        "header_preview_hex": data[:128].hex(),
        "header_preview_ascii": header_ascii,
        "candidate_entries": [asdict(entry) for entry in entries],
        "chunks": [asdict(chunk) for chunk in chunks],
    }
    (out_dir / "summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")

    # --- entry_index.tsv ----------------------------------------------------
    index_rows = [
        "entry_index\ttable_offset\tdata_offset\tdeclared_size\textracted_size\ttext_like\tprintable_ratio\tzero_ratio\toverlap_with_next\tprimary_label\tfield_names\tfield_tags\tpreview"
    ]
    for chunk in chunks:
        index_rows.append(
            "\t".join(
                [
                    str(chunk.index),
                    f"0x{chunk.table_offset:X}",
                    f"0x{chunk.data_offset:X}",
                    f"0x{chunk.declared_size:X}",
                    f"0x{chunk.extracted_size:X}",
                    str(int(chunk.text_like)),
                    f"{chunk.printable_ratio:.4f}",
                    f"{chunk.zero_ratio:.4f}",
                    str(int(chunk.overlap_with_next)),
                    chunk.primary_label or "",
                    ",".join(chunk.field_names),
                    ",".join(chunk.field_tags),
                    chunk.preview.replace("\t", " "),
                ]
            )
        )
    (out_dir / "entry_index.tsv").write_text("\n".join(index_rows) + "\n", encoding="utf-8")

    # --- descriptor_index.tsv -----------------------------------------------
    descriptor_chunks = [chunk for chunk in chunks if chunk.primary_label or chunk.field_names]
    descriptor_rows = [
        "entry_index\tprimary_label\tfield_names\tfield_tags\tdata_offset\tdeclared_size"
    ]
    for chunk in descriptor_chunks:
        descriptor_rows.append(
            "\t".join(
                [
                    str(chunk.index),
                    chunk.primary_label or "",
                    ",".join(chunk.field_names),
                    ",".join(chunk.field_tags),
                    f"0x{chunk.data_offset:X}",
                    f"0x{chunk.declared_size:X}",
                ]
            )
        )
    (out_dir / "descriptor_index.tsv").write_text("\n".join(descriptor_rows) + "\n", encoding="utf-8")

    # --- descriptor_neighborhoods.tsv ---------------------------------------
    chunk_by_index = {chunk.index: chunk for chunk in chunks}
    total = len(chunks)
    interesting = {"JELYHACK", "JELYH2", "NPCTRIG", "CRUZTRIG", "TRIGPAD", "SPECIAL", "EVENT", "SFXTRIG"}
    neighborhood_rows = [
        "center_index\tneighbor_index\tprimary_label\tfield_names\tfield_tags"
    ]
    emitted_pairs: set[tuple[int, int]] = set()
    for center_index in (chunk.index for chunk in chunks if chunk.primary_label in interesting):
        # Four positions either side of the centre; the centre itself is listed too.
        for neighbor_index in range(max(0, center_index - 4), min(total, center_index + 5)):
            pair = (center_index, neighbor_index)
            if pair in emitted_pairs:
                continue
            emitted_pairs.add(pair)
            member = chunk_by_index[neighbor_index]
            neighborhood_rows.append(
                "\t".join(
                    [
                        str(center_index),
                        str(neighbor_index),
                        member.primary_label or "",
                        ",".join(member.field_names),
                        ",".join(member.field_tags),
                    ]
                )
            )
    (out_dir / "descriptor_neighborhoods.tsv").write_text("\n".join(neighborhood_rows) + "\n", encoding="utf-8")

    # --- referent_anchor_event_graph.tsv ------------------------------------
    anchor_rows = [
        "anchor_index\tanchor_label\tanchor_fields\tneighbor_index\tdistance\tneighbor_label\tneighbor_fields\tneighbor_role\tevent_evidence"
    ]
    for anchor in descriptor_chunks:
        if not (anchor.primary_label and has_referent_field(anchor)):
            continue
        for neighbor_index in range(max(0, anchor.index - 5), min(total, anchor.index + 6)):
            if neighbor_index == anchor.index:
                continue
            neighbor = chunk_by_index[neighbor_index]
            event_evidence = get_event_evidence(neighbor)
            if not event_evidence:
                continue
            anchor_rows.append(
                "\t".join(
                    [
                        str(anchor.index),
                        anchor.primary_label,
                        ",".join(anchor.field_names),
                        str(neighbor.index),
                        f"{neighbor.index - anchor.index:+d}",
                        neighbor.primary_label or "",
                        ",".join(neighbor.field_names),
                        chunk_role(neighbor),
                        ",".join(event_evidence),
                    ]
                )
            )
    (out_dir / "referent_anchor_event_graph.tsv").write_text("\n".join(anchor_rows) + "\n", encoding="utf-8")

    # --- focused island graphs and descriptor comparisons -------------------
    write_island_graph(
        out_dir, "jelyhack_island_graph.md", "JELYHACK Island Graph",
        {"JELYHACK", "JELYH2"},
        descriptor_chunks, chunk_by_index, total,
    )
    write_descriptor_compare(
        out_dir, "jelyhack_descriptor_compare.tsv",
        {"JELYHACK", "JELYH2", "REE_BOOT", "SURCAMEW", "SFXTRIG"},
        descriptor_chunks,
    )
    write_island_graph(
        out_dir, "event_island_graph.md", "EVENT Cluster Graph",
        {"EVENT", "COR_BOOT", "NPCTRIG", "ROLL_NS", "CRUZTRIG"},
        descriptor_chunks, chunk_by_index, total,
    )
    write_descriptor_compare(
        out_dir, "event_descriptor_compare.tsv",
        {"ROLL_NS", "COR_BOOT", "EVENT", "NPCTRIG", "CRUZTRIG", "NPC_ONLY", "VMAIL"},
        descriptor_chunks,
    )
    write_island_graph(
        out_dir, "boot_frontier_graph.md", "AND/BRO Boot Frontier Graph",
        {"AND_BOOT", "BRO_BOOT"},
        descriptor_chunks, chunk_by_index, total,
        window=6,
    )
    write_descriptor_compare(
        out_dir, "boot_family_compare.tsv",
        {"AND_BOOT", "BRO_BOOT", "COR_BOOT", "VAR_BOOT", "REE_BOOT"},
        descriptor_chunks,
    )
    write_island_graph(
        out_dir, "environmental_event_graph.md", "Environmental Event Graph",
        {"FLAMEBOX", "NOSTRIL", "STEAMBOX"},
        descriptor_chunks, chunk_by_index, total,
        window=5,
    )
    write_descriptor_compare(
        out_dir, "environmental_family_compare.tsv",
        {"FLAMEBOX", "NOSTRIL", "STEAMBOX"},
        descriptor_chunks,
    )
    write_descriptor_compare(
        out_dir, "callback_trigger_compare.tsv",
        {"SURCAMNS", "SURCAMEW"},
        descriptor_chunks,
    )
    write_event_family_reports(out_dir, descriptor_chunks, chunk_by_index, total)

    # --- README.md ----------------------------------------------------------
    readme = [
        "# EUSECODE.FLX First-Pass Extraction",
        "",
        f"Input: {input_path}",
        f"File size: 0x{len(data):X} ({len(data)} bytes)",
        f"Candidate entries: {len(entries)}",
        "",
        "## Header Preview",
        "",
        f"ASCII: `{header_ascii}`",
        "",
        "## Chunks",
        "",
        "| # | Table Off | Data Off | Declared Size | Next Off | Text | Overlap | Preview |",
        "|---:|---:|---:|---:|---:|:---:|:---:|---|",
    ]
    for chunk in chunks:
        next_off = "-" if chunk.next_offset is None else f"0x{chunk.next_offset:X}"
        text_flag = "yes" if chunk.text_like else "no"
        overlap_flag = "yes" if chunk.overlap_with_next else "no"
        # "|" would end the Markdown table cell, so it is swapped for "/".
        safe_preview = chunk.preview.replace("|", "/")
        readme.append(
            f"| {chunk.index} | 0x{chunk.table_offset:X} | 0x{chunk.data_offset:X} | 0x{chunk.declared_size:X} | {next_off} | {text_flag} | {overlap_flag} | {safe_preview} |"
        )
    readme += [
        "",
        "## Notes",
        "",
        "- The extractor now parses the validated FLX table directly: entry count at `0x54`, table at `0x80`, 8 bytes per entry.",
        "- Overlapping declared sizes likely mean some entries are counts or record spans rather than exact chunk lengths.",
        "- `.strings.txt` files are the main human-readable output for now; `.txt` files are emitted only for chunks that look text-like.",
        "- `descriptor_index.tsv` summarizes guessed class labels, field names, and compact tag patterns for descriptor-like chunks.",
        "- `descriptor_neighborhoods.tsv` captures local table neighborhoods around trigger/event-related classes such as `JELYHACK`, `NPCTRIG`, `CRUZTRIG`, `TRIGPAD`, and `SPECIAL`.",
        "- `referent_anchor_event_graph.tsv` groups referent-bearing descriptors with nearby event-bearing neighbors so the attachment model can be inspected without ad hoc grepping.",
        "- `jelyhack_island_graph.md` renders the first focused graph view for the `JELYHACK` / `JELYH2` neighborhood, marking likely event-bearing attachments such as `REE_BOOT`, `SURCAMEW`, and `SFXTRIG` when they appear within the local table window.",
        "- `jelyhack_descriptor_compare.tsv` captures the first 16 header words, first 8 dwords, and a few odd printable markers for the core JELYHACK-island descriptors so structural similarity can be compared without raw hex dumps.",
        "- `event_island_graph.md` renders the denser `EVENT` / `COR_BOOT` / `NPCTRIG` / `ROLL_NS` / `CRUZTRIG` island, which currently looks like the strongest event-explicit neighborhood outside the JELYHACK anchor case.",
        "- `event_descriptor_compare.tsv` captures the same header-word and printable-marker comparison for the `EVENT` island so large event-bearing descriptors can be contrasted with neighboring trigger and referent records.",
        "- `boot_frontier_graph.md` renders the upstream referent neighborhood feeding `AND_BOOT` / `BRO_BOOT`, which is currently the clearest unexplored boot-event frontier.",
        "- `boot_family_compare.tsv` compares the five `_BOOT` event cores (`AND_BOOT`, `BRO_BOOT`, `COR_BOOT`, `VAR_BOOT`, `REE_BOOT`) by header words, markers, and field tags.",
        "- `environmental_event_graph.md` renders the three hazard/event islands centered on `FLAMEBOX`, `NOSTRIL`, and `STEAMBOX`, each surrounded by its own referent-heavy local neighborhood.",
        "- `environmental_family_compare.tsv` compares the environmental event trio so the shared hazard pattern (`referent,event,<hazard>,<hazard2>,direction,count`) can be contrasted directly.",
        "- `callback_trigger_compare.tsv` compares `SURCAMNS` and `SURCAMEW` directly so the callback-only `eventTrigger` lane can be checked against the active `event` families without raw hex dumps.",
        "- `event_family_index.tsv` and `event_family_summary.md` classify all current `event` and `eventTrigger` descriptors into reusable families such as boot-event cores, minimal event cores, environmental events, and callback-only surveillance triggers.",
    ]
    (out_dir / "README.md").write_text("\n".join(readme) + "\n", encoding="utf-8")

    # --- all_strings.txt ----------------------------------------------------
    all_strings = iter_printable_runs(data)
    strings_listing = "\n".join(all_strings)
    if all_strings:
        strings_listing += "\n"
    (out_dir / "all_strings.txt").write_text(strings_listing, encoding="utf-8")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line: optional ``input`` and ``output`` paths.

    Both positionals are converted to ``pathlib.Path`` and fall back to
    DEFAULT_INPUT and DEFAULT_OUTPUT when omitted.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    for name, fallback in (("input", DEFAULT_INPUT), ("output", DEFAULT_OUTPUT)):
        cli.add_argument(name, nargs="?", type=pathlib.Path, default=fallback)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Run the extractor end to end and return the process exit code (always 0).

    Reads the input container, creates the output directory, parses the entry
    table, dumps every entry, writes the report files and prints a one-line
    summary.
    """
    options = parse_args()
    blob = options.input.read_bytes()
    options.output.mkdir(parents=True, exist_ok=True)

    table = parse_flx_table(blob)
    chunks = extract_candidates(blob, options.output, table.entries)
    write_summary(options.output, options.input, blob, table.entries, chunks)

    message = (
        f"Parsed {table.entry_count} table slots with {len(chunks)} non-zero entries; extracted to {options.output}"
    )
    print(message)
    return 0
|
||||
|
||||
|
||||
# Run the extractor only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue