- Enhance `extract_eusecode_flx.py` to derive class event rows with additional metadata, including derived body windows and repeated template statuses.
- Introduce `usecode_family_compare.py` for comparing event families, analyzing commonalities in event bodies, and generating reports on identical groups and differences.
- Implement new data structures for managing class event rows and family artifact specifications.
- Update output formats to include derived body information and repeated family regression checks.
- Ensure robust validation of repeated family expectations against actual extracted data.
165 lines
No EOL
5.9 KiB
Python
165 lines
No EOL
5.9 KiB
Python
import csv
|
|
import glob
|
|
import hashlib
|
|
import json
|
|
from pathlib import Path
|
|
|
|
|
|
# Directory two levels above this file; every input path is derived from it.
ROOT = Path(__file__).resolve().parents[1]

# Tab-separated index of per-class layout rows.
LAYOUT_PATH = ROOT.joinpath("USECODE", "EUSECODE_extracted", "class_layout_index.tsv")

# Tab-separated index of per-class event rows.
EVENT_PATH = ROOT.joinpath("USECODE", "EUSECODE_extracted", "class_event_index.tsv")

# Directory holding the extracted chunk_*.bin files.
CHUNKS_DIR = ROOT.joinpath("USECODE", "EUSECODE_extracted", "chunks")
|
|
|
|
# Families to compare: each entry names the classes that belong together and
# the event slots whose bodies are compared across those classes.
FAMILIES = {
    "BOOT": dict(
        classes=["AND_BOOT", "BRO_BOOT", "COR_BOOT", "REE_BOOT", "VAR_BOOT"],
        slots=[0x0A, 0x0F, 0x10],
    ),
    "SURCAM": dict(
        classes=["SURCAMNS", "SURCAMEW"],
        slots=[0x20, 0x21, 0x22],
    ),
    "JELY": dict(
        classes=["JELYHACK", "JELYH2"],
        slots=[0x01],
    ),
}
|
|
|
|
|
|
def parse_hex(value: str) -> int:
    """Return the integer encoded by the hexadecimal string *value*.

    Parsing is delegated to ``int`` with base 16, so an optional ``0x``
    prefix is accepted and a malformed string raises ``ValueError``.
    """
    return int(value, base=16)
|
|
|
|
|
|
def common_prefix_length(blobs: list[bytes]) -> int:
    """Return how many leading bytes are identical across all *blobs*.

    An empty list yields 0. The result never exceeds the length of the
    shortest blob.
    """
    if not blobs:
        return 0
    # zip() stops at the shortest blob, so each column holds one byte per blob.
    for position, column in enumerate(zip(*blobs)):
        reference, *others = column
        if any(value != reference for value in others):
            return position
    # No mismatch inside the shared range: the whole shortest blob is common.
    return min(map(len, blobs))
|
|
|
|
|
|
def common_suffix_length(blobs: list[bytes]) -> int:
    """Return how many trailing bytes are identical across all *blobs*.

    Each blob is reversed so the shared suffix becomes a shared prefix,
    which ``common_prefix_length`` then measures.
    """
    mirrored = [blob[::-1] for blob in blobs]
    return common_prefix_length(mirrored)
|
|
|
|
|
|
def first_diff_positions(blobs: list[bytes], limit: int = 8) -> list[int]:
    """Return the first byte positions at which *blobs* disagree.

    A position counts as differing when the blobs do not all hold the same
    byte there; a blob that is too short to reach the position is treated as
    holding ``None``, so length differences are reported as well.

    Args:
        blobs: Byte strings to compare. An empty list yields ``[]``.
        limit: Maximum number of positions to report. Zero or a negative
            value yields ``[]``.

    Returns:
        Ascending positions, at most *limit* of them.
    """
    positions: list[int] = []
    # Nothing to compare, or nothing requested: max() below would raise on
    # an empty sequence, and the cap is only checked after an append.
    if not blobs or limit <= 0:
        return positions

    longest = max(len(blob) for blob in blobs)
    for index in range(longest):
        # None stands in for "this blob has already ended".
        values = {blob[index] if index < len(blob) else None for blob in blobs}
        if len(values) > 1:
            positions.append(index)
            if len(positions) >= limit:
                break
    return positions
|
|
|
|
|
|
def load_layouts(targets: set[str]) -> dict[str, dict[str, str]]:
    """Read the layout index and keep the rows for the classes in *targets*.

    The file at ``LAYOUT_PATH`` is parsed as tab-separated text with a
    header row. The result maps each ``class_name_hint`` value to its row;
    when a class appears more than once, the last row read wins.
    """
    with LAYOUT_PATH.open("r", encoding="utf-8", newline="") as handle:
        return {
            record["class_name_hint"]: record
            for record in csv.DictReader(handle, delimiter="\t")
            if record["class_name_hint"] in targets
        }
|
|
|
|
|
|
def load_events(targets: set[str]) -> dict[str, list[dict[str, str]]]:
    """Read the event index and group the rows of the classes in *targets*.

    The file at ``EVENT_PATH`` is parsed as tab-separated text with a header
    row. The result maps each ``class_name_hint`` value to its rows, ordered
    by the numeric value of the hexadecimal ``slot`` column (rows with equal
    slots keep file order).
    """
    grouped: dict[str, list[dict[str, str]]] = {}
    with EVENT_PATH.open("r", encoding="utf-8", newline="") as handle:
        for record in csv.DictReader(handle, delimiter="\t"):
            class_name = record["class_name_hint"]
            if class_name not in targets:
                continue
            if class_name not in grouped:
                grouped[class_name] = []
            grouped[class_name].append(record)

    # sorted() is stable, like list.sort(), so ties keep their file order.
    for class_name in grouped:
        grouped[class_name] = sorted(
            grouped[class_name],
            key=lambda record: parse_hex(record["slot"]),
        )
    return grouped
|
|
|
|
|
|
def resolve_chunk(data_offset: int) -> Path:
    """Locate the single extracted chunk file recorded at *data_offset*.

    Chunk files live in ``CHUNKS_DIR`` and are matched by the name pattern
    ``chunk_*_off_<OFFSET>_len_*.bin``, where ``<OFFSET>`` is *data_offset*
    as at least six upper-case hexadecimal digits.

    Args:
        data_offset: Offset value embedded in the chunk file name.

    Returns:
        Path of the one matching chunk file.

    Raises:
        RuntimeError: If zero or more than one file matches.
    """
    # Escape only the directory part: a checkout path containing glob
    # metacharacters ("[", "*", "?") must be matched literally, while the
    # file name keeps its wildcards.
    literal_dir = Path(glob.escape(str(CHUNKS_DIR)))
    pattern = literal_dir / f"chunk_*_off_{data_offset:06X}_len_*.bin"
    # Sorted so the error message is deterministic when several files match.
    matches = sorted(glob.glob(str(pattern)))
    if len(matches) != 1:
        raise RuntimeError(f"chunk lookup failed for 0x{data_offset:06X}: {matches}")
    return Path(matches[0])
|
|
|
|
|
|
def build_rows() -> dict[tuple[str, int], dict[str, object]]:
    """Slice the event bodies of every class named in ``FAMILIES``.

    For each class with a layout row, the chunk file selected by the
    ``data_offset`` column is read, and every event row whose
    ``raw_code_offset`` is non-zero gets a body window. The window starts at
    ``code_base_minus_one + raw_code_offset`` and ends where the next larger
    offset of the same class begins, or at the end of the chunk if there is
    none.

    Returns:
        A mapping from ``(class_name, slot)`` to a row dict holding the
        event hints copied from the index, the window bounds (``start``,
        ``end``, ``length``), the SHA-1 hex digest and a 16-byte hex preview
        of the body, the chunk path with forward slashes, and the raw
        ``body`` bytes.
    """
    targets: set[str] = set()
    for family in FAMILIES.values():
        targets.update(family["classes"])

    layouts = load_layouts(targets)
    events = load_events(targets)

    rows_by_key: dict[tuple[str, int], dict[str, object]] = {}
    for class_name, layout in layouts.items():
        chunk_path = resolve_chunk(parse_hex(layout["data_offset"]))
        blob = chunk_path.read_bytes()
        base = parse_hex(layout["code_base_minus_one"])

        # Pair each event row with its parsed offset; offset 0 rows are dropped.
        located = []
        for event in events[class_name]:
            offset = parse_hex(event["raw_code_offset"])
            if offset != 0:
                located.append((event, offset))

        # Map every distinct offset to the next larger one; the largest
        # offset has no entry and therefore runs to the end of the chunk.
        ordered = sorted({offset for _, offset in located})
        following = dict(zip(ordered, ordered[1:]))

        # Same for every row of this class, so compute it once.
        chunk_label = str(chunk_path).replace("\\", "/")

        for event, code_offset in located:
            slot = parse_hex(event["slot"])
            start = base + code_offset
            if code_offset in following:
                end = base + following[code_offset]
            else:
                end = len(blob)
            body = blob[start:end]
            rows_by_key[(class_name, slot)] = {
                "class_name": class_name,
                "slot": slot,
                "event_name_hint": event["event_name_hint"],
                "raw_event_entry_word": event["raw_event_entry_word"],
                "raw_code_offset": event["raw_code_offset"],
                "start": start,
                "end": end,
                "length": len(body),
                "sha1": hashlib.sha1(body).hexdigest(),
                "preview": body[:16].hex(" "),
                "chunk_path": chunk_label,
                "body": body,
            }

    return rows_by_key
|
|
|
|
|
|
def main() -> None:
    """Print a comparison report for every family and slot in ``FAMILIES``.

    Per slot, one tab-separated line is printed for each class of the
    family, followed by the classes grouped by identical body digest, the
    common prefix and suffix lengths of the bodies, the first differing
    byte positions, and a blank line.
    """
    rows_by_key = build_rows()
    for family_name, family in FAMILIES.items():
        print(f"## {family_name}")
        for slot in family["slots"]:
            print(f"SLOT 0x{slot:02X}")
            members = [rows_by_key[(class_name, slot)] for class_name in family["classes"]]

            for entry in members:
                fields = (
                    str(entry["class_name"]),
                    str(entry["event_name_hint"]),
                    str(entry["raw_event_entry_word"]),
                    str(entry["raw_code_offset"]),
                    f"{entry['start']:04X}-{entry['end']:04X}",
                    str(entry["length"]),
                    str(entry["sha1"])[:12],
                    str(entry["preview"]),
                    str(entry["chunk_path"]),
                )
                print("\t".join(fields))

            # Classes sharing a digest have byte-identical bodies.
            classes_by_digest: dict[str, list[str]] = {}
            bodies = []
            for entry in members:
                digest = str(entry["sha1"])
                if digest not in classes_by_digest:
                    classes_by_digest[digest] = []
                classes_by_digest[digest].append(str(entry["class_name"]))
                bodies.append(entry["body"])

            print("identical_groups=" + json.dumps(list(classes_by_digest.values())))
            prefix_len = common_prefix_length(bodies)
            suffix_len = common_suffix_length(bodies)
            print(f"common_prefix_len={prefix_len} common_suffix_len={suffix_len}")
            print("first_diff_positions=" + json.dumps(first_diff_positions(bodies)))
            print()
|
|
|
|
|
|
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()