diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2ce76c4..955e9b7 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -76,6 +76,15 @@ For broker-level MQTT investigations in this workspace, first check for a local - If the file is missing, incomplete, or clearly stale for the requested task, prompt the user to update it before attempting deeper MQTT diagnostics. - Treat the file as local machine state, not repository configuration. +## Repository MQTT/Zigbee Diagnostics Tool + +For broker-level MQTT and Zigbee2MQTT diagnostics in this repository, prefer the reusable script at `scripts\mqtt_z2m_diag.py` over one-off session scripts. + +- Use the script for retained state checks, `get` requests, `device/configure` probes, permit-join flows, pair watches, raw topic watches, and side-by-side device comparisons. +- Prefer read-only script commands first unless the investigation explicitly needs a write-like action such as `configure`, `permit-join`, or raw publish. +- Keep `scripts\mqtt_z2m_diag.py` updated as new recurring diagnostics needs appear in this workspace. +- Keep the skill at `.github\skills\mqtt-z2m-diagnostics\SKILL.md` aligned with the script as it evolves. + ## Usage Examples - If the user says `turn on the kitchen light`, ask which instance. diff --git a/.github/skills/mqtt-z2m-diagnostics/SKILL.md b/.github/skills/mqtt-z2m-diagnostics/SKILL.md new file mode 100644 index 0000000..305bbbe --- /dev/null +++ b/.github/skills/mqtt-z2m-diagnostics/SKILL.md @@ -0,0 +1,41 @@ +--- +name: mqtt-z2m-diagnostics +description: Use the repository MQTT/Zigbee2MQTT diagnostics script for broker-level Home Assistant and Zigbee2MQTT investigations. +--- + +# MQTT and Zigbee2MQTT diagnostics + +Use `scripts\mqtt_z2m_diag.py` for broker-level diagnostics in this repository instead of ad hoc one-off MQTT scripts. 
+ +## When to use this skill + +- Zigbee2MQTT devices are stale, unavailable, or not responding to Home Assistant +- Home Assistant entity state does not match broker state +- You need device records, retained state, `get` responses, `device/configure` probes, permit-join, or pair watches +- You want to compare a broken Zigbee device with a known-good one + +## Requirements + +1. Check `.local\mqtt-home.env` first. +2. If it is missing, incomplete, or stale, ask the user for the current broker details before going deeper. +3. Treat `.local\mqtt-home.env` as local machine state, not repository configuration. + +## Primary commands + +```powershell +python scripts\mqtt_z2m_diag.py health +python scripts\mqtt_z2m_diag.py device-record nitori_salotto_1 +python scripts\mqtt_z2m_diag.py state nitori_salotto_1 +python scripts\mqtt_z2m_diag.py get nitori_salotto_1 --keys state_left state_center state_right +python scripts\mqtt_z2m_diag.py configure nitori_salotto_1 +python scripts\mqtt_z2m_diag.py compare nitori_salotto_1 Switch_Cucina_Neo --configure +python scripts\mqtt_z2m_diag.py permit-join --time 90 --watch +python scripts\mqtt_z2m_diag.py pair-watch nitori_salotto_1 --time 90 --duration 95 +``` + +## Guidance + +- Prefer read-only commands first: `health`, `device-record`, `state`, `get`, `compare`, `watch` +- Use `configure`, `permit-join`, `pair-watch`, or `publish` only when the investigation clearly needs them +- When comparing devices, use the same command flow on a known-good device to prove whether the fault is in the broker path or the device itself +- If the script reveals a new recurring diagnostic need, extend `scripts\mqtt_z2m_diag.py` and keep this skill aligned with it diff --git a/docs/mqtt-broker-broad-analysis.md b/docs/mqtt-broker-broad-analysis.md new file mode 100644 index 0000000..ba0dd43 --- /dev/null +++ b/docs/mqtt-broker-broad-analysis.md @@ -0,0 +1,117 @@ +# MQTT broker broad analysis + +## Scope + +Broad passive review of the MQTT broker 
used for **casa**, focused on signs of broker stress, unusual traffic, excessive retained state, or noisy publishers that could affect performance or the network. + +## Method + +1. Verified broker reachability on TCP/1883. +2. Took a **45-second full subscription sample** (`#` + `$SYS/#`) to capture retained-state bursts and broker metrics. +3. Took a **60-second steady-state sample** with a **5-second warmup ignored** to separate normal retained snapshots from ongoing traffic. + +## Findings + +### Overall status + +**No obvious broker health issue showed up.** The broker appears stable and not under noticeable pressure: + +| Signal | Observation | +| --- | --- | +| Connected clients | 46 | +| Max clients seen | 47 | +| Subscriptions | 945 | +| Dropped publishes | 0 | +| Retained messages stored | 1026 | +| Retained store size | 1,494,548 bytes | +| Heap current / max | 4,149,612 / 4,887,547 bytes | + +The `$SYS` counters did **not** suggest backlog, churn, or message loss. + +### Traffic shape + +The first sample had a large initial burst, but it was mostly explained by **retained state replay** and **Zigbee2MQTT bridge metadata** sent immediately after subscribing: + +- 970 retained messages were seen on connect. +- Largest payloads were: + - `zigbee2mqtt_2/bridge/definitions` - 245,350 bytes + - `zigbee2mqtt/bridge/definitions` - 217,824 bytes + - `zigbee2mqtt/bridge/devices` - 82,585 bytes + - `zigbee2mqtt_2/bridge/devices` - 81,884 bytes + +That explains the one-shot peak of **1084 messages/second** during the broad sample. It looks like a subscription snapshot, **not** an ongoing flood. + +### Steady-state load + +After excluding the initial retained burst: + +| Metric | Value | +| --- | --- | +| Sample window | 60 seconds | +| Non-retained messages | 1511 | +| Non-retained bytes | 42,949 | +| Average rate | 25.18 messages/second | +| Peak second | 97 messages | +| Unique topics seen | 202 | + +That is a fairly modest steady-state load. 
The broker is handling a reasonable message rate without signs of distress. + +### Noisiest publishers + +The clear dominant talker is a **Shelly EM3** namespace: + +- Root prefix `shellies` accounted for **1362 / 1511** steady-state messages. +- The top topics were all from `shellies/shellyem3-485519D91C40/emeter/...`. +- Individual EM3 topics appeared **35 times in 60 seconds**, which is chatty but not bandwidth-heavy. + +Important nuance: + +- This is mostly a **message-count** issue, not a **bandwidth** issue. +- The same steady-state sample shows `shellies` produced only **7004 bytes** total. + +So the Shelly EM3 is the main source of ongoing chatter, but it does **not** currently look like a broker or network problem by itself. + +### Large payloads + +Outside the retained startup burst, large payloads were minimal: + +- Only one large non-retained payload was observed in the steady-state sample: + - `frigate/stats` - 10,743 bytes + +Early large-byte topics from `frigate` snapshots and `hass.agent` thumbnails appeared in the broad capture, but they did **not** show up as sustained heavy traffic in the steady-state sample. + +### Topic naming oddities + +Several Zigbee2MQTT topics contain spaces, for example: + +- `zigbee2mqtt/Btcino coso salotto/availability` +- `zigbee2mqtt/Letty Condizionatore Ufficio/availability` + +This is **not** a broker anomaly, but it is worth noting: + +- it can make tooling and ad-hoc topic handling more brittle +- it increases the chance of mistakes in scripts, automations, and CLI work + +If you want cleaner topic hygiene, consider slug-style friendly names for Zigbee2MQTT devices. + +## Conclusion + +### What looks healthy + +- No dropped publishes +- No sign of broker backlog or unstable client churn +- Retained store is present but not unusually large +- Heap usage is not alarming +- Steady-state traffic volume is modest + +### What stands out + +1. 
**A retained-state burst on subscribe**, mostly from Zigbee2MQTT bridge metadata. This is expected behavior and not a live flood. +2. **A very chatty Shelly EM3 publisher** dominating message count. It is the main thing to watch, but at current byte volume it does not look harmful. +3. **Topic names with spaces** in Zigbee2MQTT. Not a performance issue, but a maintainability footgun. + +## Recommendation + +No urgent remediation is indicated from this pass. + +If you want to reduce noise further, the best next place to look would be the publish frequency/config of `shellies/shellyem3-485519D91C40`, since that device is responsible for most of the ongoing message count. diff --git a/docs/salotto-overview-switch-investigation.md b/docs/salotto-overview-switch-investigation.md new file mode 100644 index 0000000..0e0e37a --- /dev/null +++ b/docs/salotto-overview-switch-investigation.md @@ -0,0 +1,298 @@ +# Salotto Overview Switch Investigation + +Date: 2026-04-17 + +Instance: Casa + +No write actions were taken during the initial diagnosis documented in the sections up to **Recommended Fix**; that part was a read-only Home Assistant investigation. Changes made afterwards are listed under **Applied Changes**. + +## Scope + +This document investigates why the main **Salotto** light control in the **Overview** dashboard no longer works, while the individual Salotto lights still turn on correctly. + +MQTT-level debugging was not needed to isolate the dashboard problem because the Home Assistant entity, dashboard, history, and automation data were enough. Broker-level diagnostics were only used later, to restore the stale switch (see **MQTT and Zigbee2MQTT Restore Diagnostics**). + +## Executive Summary + +The Overview control is not directly toggling the Salotto light group. + +Instead: + +1. The card displays `light.luci_buone_salotto`. +2. Its tap action calls `switch.toggle` on `switch.nitori_salotto_1_left`. +3. The automation `automation.pulsanti_luce_salotto` listens for changes on that switch and then turns `light.luci_buone_salotto` on or off. 
+ +That means the dashboard button depends on an indirect path: + +`Overview card` -> `switch.nitori_salotto_1_left` -> `automation.pulsanti_luce_salotto` -> `light.luci_buone_salotto` + +Right now that path is out of sync: + +- `light.luci_buone_salotto` is currently `off` +- `switch.nitori_salotto_1_left` is currently `on` +- the switch has not changed state since `2026-04-14T16:17:11Z` +- the light group has continued changing independently through `2026-04-17T09:59:53Z` + +So the Overview card is using the light group as its displayed state, but it is controlling a different entity whose state no longer matches the light group. That is the reason the button appears broken. + +## Confirmed Facts + +### 1. The Casa instance was queried + +The active Home Assistant instance returned: + +- location name: `Home` +- base URL: `http://supervisor/core` +- architecture: Casa-sized instance with about `1414` entities and `11` areas + +This matches the expected Casa fingerprint. + +### 2. The Overview dashboard Salotto control targets the wrong entity for a direct light toggle + +In the `lovelace` dashboard, the main Salotto controls were found in multiple places, including: + +- `.views[0].badges[4]` +- `.views[0].sections[0].cards[1]` +- `.views[9].sections[0].cards[3]` + +All of them display `light.luci_buone_salotto`, but the action is: + +```yaml +tap_action: + action: perform-action + perform_action: switch.toggle + target: + entity_id: + - switch.nitori_salotto_1_left +``` + +The icon tap action is wired the same way. + +So the card is not toggling the light entity it displays. + +### 3. The switch is part of a Zigbee2MQTT wall switch device + +`switch.nitori_salotto_1_left` belongs to device: + +- device: `nitori_salotto_1` +- model: `Smart light switch - 3 gang without neutral wire` +- integration: `zigbee2mqtt` + +Related entities: + +- `switch.nitori_salotto_1_left` +- `switch.nitori_salotto_1_center` +- `switch.nitori_salotto_1_right` + +### 4. 
The automation still links that switch to the Salotto light group + +`automation.pulsanti_luce_salotto` is configured to react to state changes of the left switch and then control the group: + +- if the switch is off -> `light.turn_off` on `light.luci_buone_salotto` +- if the switch is on -> `light.turn_on` on `light.luci_buone_salotto` + +So the dashboard button currently relies on this automation path instead of directly toggling the group. + +### 5. The switch and the light group are no longer aligned + +Current state snapshot: + +- `switch.nitori_salotto_1_left`: `on` +- `light.luci_buone_salotto`: `off` + +Recent history shows: + +- the switch last changed on `2026-04-14T16:17:11Z` +- the light group continued changing after that, including: + - `2026-04-15T15:28:57Z` -> `on` + - `2026-04-15T22:52:06Z` -> `off` + - `2026-04-16T16:03:40Z` -> `on` + - `2026-04-16T22:12:21Z` -> `off` + - `2026-04-17T09:57:13Z` -> `on` + - `2026-04-17T09:59:53Z` -> `off` + +This confirms the light group is being controlled independently of the switch, so the dashboard button and the displayed light state can no longer be trusted to represent the same thing. + +### 6. The automation path itself is not missing + +Recent traces exist for `automation.pulsanti_luce_salotto`, including runs on `2026-04-13` and `2026-04-14`, triggered by `switch.nitori_salotto_1_left`. + +That means the automation definition is present and did run when the switch state changed. The issue is not a missing automation. + +## Diagnosis + +The Overview control broke because it mixes: + +- **display state** from `light.luci_buone_salotto` +- **control action** on `switch.nitori_salotto_1_left` + +This only behaves correctly while the switch state and the light group state stay synchronized. + +They no longer do. + +Once the switch remained `on` while the group was later turned `off` by some other path, pressing the Overview button stopped behaving like a normal room-light toggle. 
The card looks like a light control, but it is really driving an unrelated intermediate switch entity. + +## Secondary Findings + +- `light.salotto` also exists as a separate group entity that includes all Salotto lights. +- `light.luce_salotto` exists but is currently `unavailable` and appears to be a Tuya entity. It does not appear to be the Overview control target found in this investigation. +- The main failure is the dashboard action wiring, not a full outage of the Salotto light entities. + +## Recommended Fix + +The safest fix is to make the Overview Salotto card toggle the actual light group it displays instead of the wall-switch entity. + +Good options: + +1. Change the card action to toggle `light.luci_buone_salotto` directly. +2. If the desired room-level target is broader, use `light.salotto` directly instead. + +Less robust option: + +1. Keep the switch-based path and add more logic to keep the switch state synchronized with the light group. + +That indirect design is the source of the breakage, so direct light control is the cleaner fix. + +## Applied Changes + +The following Home Assistant changes were applied on the Casa instance during this pass: + +1. The `lovelace` Overview Salotto controls were changed to toggle `light.luci_buone_salotto` directly instead of calling `switch.toggle` on `switch.nitori_salotto_1_left`. +2. 
A new automation, `automation.sync_salotto_control_switches`, was created to mirror the state of `light.luci_buone_salotto` back to: + - `switch.nitori_salotto_1_left` + - `switch.switch_cucina_neo_l2` + +The intent of that automation is: + +- when the light group turns on -> both control switches should be turned on +- when the light group turns off -> both control switches should be turned off + +This preserves: + +- direct and reliable dashboard control of the actual light group +- consistent switch state for the two wall-switch control paths + +## Additional Finding: the Nitori switch is currently stale at the service layer + +After applying the config fix, a direct `switch.turn_off` call was sent to `switch.nitori_salotto_1_left` to reconcile its stale `on` state with the currently `off` light group. + +The service call was accepted, but Home Assistant could not verify a state change. + +Current evidence: + +- `light.luci_buone_salotto` is `off` +- `switch.switch_cucina_neo_l2` is `off` +- `switch.nitori_salotto_1_left` still reports `on` +- `switch.nitori_salotto_1_left` has not changed state since `2026-04-14T16:17:11Z` +- Home Assistant system logs contain repeated warnings that `switch.nitori_salotto_1_left` is "missing or not currently available" when targeted by services + +This means the original dashboard problem and the current switch-state problem are related but not identical: + +1. **Dashboard problem:** fixed by changing the card to control the light group directly. +2. **Nitori state-sync problem:** still blocked because the `switch.nitori_salotto_1_left` entity is not currently accepting or reflecting Home Assistant service commands reliably. 
+ +## Practical Conclusion + +The reliable control path is now: + +`Overview card` -> `light.luci_buone_salotto` + +The intended synchronization path is now: + +`light.luci_buone_salotto` -> `automation.sync_salotto_control_switches` -> both wall-switch entities + +However, the Nitori leg of that sync will only work once `switch.nitori_salotto_1_left` is healthy again at the Zigbee2MQTT/device layer. + +So the configuration fix is in place, but full synchronization of the Salotto Nitori switch still depends on restoring normal commandability of that switch entity. + +## MQTT and Zigbee2MQTT Restore Diagnostics + +Broker-level MQTT diagnostics were run against the Casa Zigbee2MQTT base topic after creating the local file: + +- `.local/mqtt-home.env` + +### What Zigbee2MQTT still knows about the device + +The retained `bridge/devices` record still contains `nitori_salotto_1`: + +- friendly name: `nitori_salotto_1` +- IEEE address: `0xa4c1386a5b20e7a7` +- model: `TS0013` +- vendor: `Tuya` +- interview state: `SUCCESSFUL` +- supported: `true` +- type: `EndDevice` + +So the device has not been removed from Zigbee2MQTT and still exists in the coordinator database. + +### Important difference vs the working kitchen switch + +Using the same broker-level probe against the working fallback switch `Switch_Cucina_Neo` produced: + +- a live topic payload on `zigbee2mqtt_2/Switch_Cucina_Neo` +- fresh state values such as `state_l2` +- a successful response on `zigbee2mqtt_2/bridge/response/device/configure` + +Using the same probe against `nitori_salotto_1` produced: + +- no live topic payload on `zigbee2mqtt_2/nitori_salotto_1` +- no availability payload +- no bridge log output +- no response on `zigbee2mqtt_2/bridge/response/device/configure` + +This is a strong indicator that Zigbee2MQTT still has the device definition, but the device is not currently responding on the Zigbee network. + +### What this means + +At this point the likely fault domain is one of: + +1. 
the switch has lost effective connectivity to the Zigbee mesh +2. the switch is powered but stuck and not answering Zigbee commands +3. the device has fallen off the mesh badly enough that reconfigure requests never complete + +This is not behaving like a dashboard, automation, or Home Assistant entity-registry issue anymore. + +## Best Next Restore Step + +The highest-probability recovery action now is: + +1. physically power-cycle the Nitori switch circuit or otherwise restore power to the device +2. immediately retest Zigbee2MQTT commandability +3. if it still does not answer, re-pair or re-interview the device in Zigbee2MQTT while preserving the same friendly name if possible + +Because the kitchen comparison proved the MQTT request flow is correct, there is no value in continuing to tweak the Home Assistant dashboard or automation config until the Nitori device starts answering Zigbee2MQTT again. + +## Successful Recovery + +The device was recovered without a power-cycle by putting `nitori_salotto_1` back into pairing mode while Zigbee2MQTT permit-join was open. + +During the recovery watch, Zigbee2MQTT reported: + +- `bridge/response/permit_join` with `status: ok` +- `bridge/event` with `type: device_announce` for `nitori_salotto_1` +- fresh live payloads again on `zigbee2mqtt_2/nitori_salotto_1` + +Fresh MQTT payloads after re-announce included: + +- `state_left: OFF` +- `state_center: OFF` +- `state_right: OFF` +- `backlight_mode: normal` +- linkquality around `134-145` + +After the re-announce: + +- Home Assistant updated `switch.nitori_salotto_1_left` back to `off` +- `switch.nitori_salotto_1_right` also recovered from stale state and updated to `off` +- the Salotto light group and kitchen fallback switch remained synchronized + +Final validation: + +1. The left Nitori button was pressed once after recovery. +2. `switch.nitori_salotto_1_left` changed from `off` to `on`. +3. `light.luci_buone_salotto` turned `on` immediately after. +4. 
`switch.switch_cucina_neo_l2` was then synchronized to `on` by `automation.sync_salotto_control_switches`. + +So the switch is now restored at the Zigbee2MQTT layer and the end-to-end control path is working again: + +`nitori_salotto_1_left` -> `automation.pulsanti_luce_salotto` -> `light.luci_buone_salotto` -> `automation.sync_salotto_control_switches` -> fallback switch state diff --git a/scripts/mqtt_z2m_diag.py b/scripts/mqtt_z2m_diag.py new file mode 100644 index 0000000..75f8ce6 --- /dev/null +++ b/scripts/mqtt_z2m_diag.py @@ -0,0 +1,460 @@ +#!/usr/bin/env python +"""Reusable MQTT and Zigbee2MQTT diagnostics for this repository.""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +import time +from pathlib import Path +from typing import Any, Iterable + + +REPO_ROOT = Path(__file__).resolve().parents[1] +DEFAULT_ENV_PATH = REPO_ROOT / ".local" / "mqtt-home.env" +DEFAULT_BASE_TOPIC = "zigbee2mqtt_2" + + +def ensure_paho(): + try: + import paho.mqtt.client as mqtt # type: ignore + except Exception: + subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "paho-mqtt"]) + import paho.mqtt.client as mqtt # type: ignore + return mqtt + + +def load_env_file(path: Path) -> None: + if not path.exists(): + raise FileNotFoundError( + f"MQTT env file not found at {path}. Create it first (see .github/copilot-instructions.md)." 
+ ) + for raw_line in path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, value = line.split("=", 1) + os.environ.setdefault(key.strip(), value.strip()) + + +def mqtt_config(env_path: Path) -> dict[str, Any]: + load_env_file(env_path) + missing = [ + key + for key in ("MQTT_HOST", "MQTT_PORT", "MQTT_USERNAME", "MQTT_PASSWORD") + if not os.environ.get(key) + ] + if missing: + raise RuntimeError(f"Missing MQTT settings in {env_path}: {', '.join(missing)}") + return { + "host": os.environ["MQTT_HOST"], + "port": int(os.environ["MQTT_PORT"]), + "username": os.environ["MQTT_USERNAME"], + "password": os.environ["MQTT_PASSWORD"], + "base_topic": os.environ.get("MQTT_BASE_TOPIC", DEFAULT_BASE_TOPIC), + } + + +def parse_json_if_possible(payload: str) -> Any: + try: + return json.loads(payload) + except Exception: + return payload + + +def format_payload(payload: str) -> str: + parsed = parse_json_if_possible(payload) + if isinstance(parsed, (dict, list)): + return json.dumps(parsed, indent=2, sort_keys=True) + return str(parsed) + + +def print_message(topic: str, payload: str) -> None: + print(f"TOPIC: {topic}") + print(format_payload(payload)) + print("---") + + +def normalize_topic(base_topic: str, topic: str) -> str: + if topic.startswith(base_topic + "/") or topic.startswith("$"): + return topic + return f"{base_topic}/{topic}" + + +class CaptureClient: + def __init__( + self, + config: dict[str, Any], + subscriptions: Iterable[str], + publications: Iterable[dict[str, Any]] | None = None, + duration: float = 5.0, + ) -> None: + self._mqtt = ensure_paho() + self.config = config + self.subscriptions = list(dict.fromkeys(subscriptions)) + self.publications = list(publications or []) + self.duration = duration + self.messages: list[tuple[str, str]] = [] + + def run(self) -> list[tuple[str, str]]: + client = self._mqtt.Client(self._mqtt.CallbackAPIVersion.VERSION2) + 
client.username_pw_set(self.config["username"], self.config["password"]) + + def on_connect(client, userdata, flags, reason_code, properties=None): + for topic in self.subscriptions: + client.subscribe(topic) + for publication in self.publications: + client.publish( + publication["topic"], + publication["payload"], + qos=publication.get("qos", 0), + retain=publication.get("retain", False), + ) + + def on_message(client, userdata, msg): + self.messages.append((msg.topic, msg.payload.decode("utf-8", "replace"))) + + client.on_connect = on_connect + client.on_message = on_message + client.connect(self.config["host"], self.config["port"], 20) + client.loop_start() + time.sleep(self.duration) + client.loop_stop() + client.disconnect() + return self.messages + + +def unique_messages(messages: Iterable[tuple[str, str]]) -> list[tuple[str, str]]: + seen: set[tuple[str, str]] = set() + result: list[tuple[str, str]] = [] + for message in messages: + if message in seen: + continue + seen.add(message) + result.append(message) + return result + + +def capture(args, subscriptions: list[str], publications: list[dict[str, Any]] | None = None) -> list[tuple[str, str]]: + config = mqtt_config(Path(args.env_file)) + client = CaptureClient(config, subscriptions, publications, duration=args.duration) + return unique_messages(client.run()) + + +def cmd_health(args) -> int: + config = mqtt_config(Path(args.env_file)) + messages = CaptureClient(config, [f"{config['base_topic']}/bridge/state"], duration=args.duration).run() + print(f"Broker: {config['host']}:{config['port']}") + print(f"Base topic: {config['base_topic']}") + print("---") + for topic, payload in unique_messages(messages): + print_message(topic, payload) + return 0 + + +def cmd_device_record(args) -> int: + config = mqtt_config(Path(args.env_file)) + messages = CaptureClient(config, [f"{config['base_topic']}/bridge/devices"], duration=args.duration).run() + devices_payload = next((payload for topic, payload in messages if 
topic.endswith("/bridge/devices")), None) + if not devices_payload: + print("No bridge/devices payload received.", file=sys.stderr) + return 1 + devices = parse_json_if_possible(devices_payload) + if not isinstance(devices, list): + print("Unexpected bridge/devices payload.", file=sys.stderr) + return 1 + matched = [ + device + for device in devices + if not args.devices + or device.get("friendly_name") in args.devices + or device.get("ieee_address") in args.devices + ] + if not matched: + print("No matching devices found.") + return 1 + print(json.dumps(matched, indent=2, sort_keys=True)) + return 0 + + +def cmd_state(args) -> int: + config = mqtt_config(Path(args.env_file)) + subscriptions: list[str] = [] + for device in args.devices: + subscriptions.append(normalize_topic(config["base_topic"], device)) + if args.include_availability: + subscriptions.append(normalize_topic(config["base_topic"], f"{device}/availability")) + messages = capture(args, subscriptions) + if not messages: + print("No state payloads received.") + return 1 + for topic, payload in messages: + print_message(topic, payload) + return 0 + + +def build_get_payload(keys: list[str]) -> str: + if not keys: + return json.dumps({"state": ""}) + return json.dumps({key: "" for key in keys}) + + +def cmd_get(args) -> int: + config = mqtt_config(Path(args.env_file)) + subscriptions = [ + normalize_topic(config["base_topic"], args.device), + normalize_topic(config["base_topic"], f"{args.device}/availability"), + ] + publications = [ + { + "topic": normalize_topic(config["base_topic"], f"{args.device}/get"), + "payload": build_get_payload(args.keys), + } + ] + messages = capture(args, subscriptions, publications) + if not messages: + print("No response received.") + return 1 + for topic, payload in messages: + print_message(topic, payload) + return 0 + + +def cmd_configure(args) -> int: + config = mqtt_config(Path(args.env_file)) + messages = capture( + args, + [ + 
f"{config['base_topic']}/bridge/response/device/configure", + f"{config['base_topic']}/bridge/log", + normalize_topic(config["base_topic"], args.device), + ], + [ + { + "topic": f"{config['base_topic']}/bridge/request/device/configure", + "payload": json.dumps({"id": args.device}), + } + ], + ) + if not messages: + print("No configure response received.") + return 1 + for topic, payload in messages: + print_message(topic, payload) + return 0 + + +def cmd_permit_join(args) -> int: + config = mqtt_config(Path(args.env_file)) + subscriptions = [f"{config['base_topic']}/bridge/response/permit_join"] + if args.watch: + subscriptions.extend( + [ + f"{config['base_topic']}/bridge/log", + f"{config['base_topic']}/bridge/event", + ] + ) + messages = capture( + args, + subscriptions, + [ + { + "topic": f"{config['base_topic']}/bridge/request/permit_join", + "payload": json.dumps({"value": True, "time": args.time}), + } + ], + ) + if not messages: + print("No permit_join response received.") + return 1 + for topic, payload in messages: + print_message(topic, payload) + return 0 + + +def cmd_pair_watch(args) -> int: + config = mqtt_config(Path(args.env_file)) + subscriptions = [ + f"{config['base_topic']}/bridge/response/permit_join", + f"{config['base_topic']}/bridge/response/device/configure", + f"{config['base_topic']}/bridge/response/device/interview", + f"{config['base_topic']}/bridge/event", + f"{config['base_topic']}/bridge/log", + normalize_topic(config["base_topic"], args.device), + normalize_topic(config["base_topic"], f"{args.device}/availability"), + ] + messages = capture( + args, + subscriptions, + [ + { + "topic": f"{config['base_topic']}/bridge/request/permit_join", + "payload": json.dumps({"value": True, "time": args.time}), + } + ], + ) + if not messages: + print("No pair-watch messages received.") + return 1 + for topic, payload in messages: + print_message(topic, payload) + return 0 + + +def cmd_compare(args) -> int: + compare_args = 
def cmd_watch(args) -> int:
    """Subscribe to the requested topics and print every captured message.

    Each entry of ``args.topics`` is passed through ``normalize_topic`` together
    with the configured base topic. ``--include-bridge-log`` and
    ``--include-bridge-event`` additionally subscribe to
    ``<base_topic>/bridge/log`` and ``<base_topic>/bridge/event``.

    Returns:
        0 when at least one message was captured, 1 when none were.
    """
    config = mqtt_config(Path(args.env_file))
    base_topic = config["base_topic"]
    subscriptions = [normalize_topic(base_topic, topic) for topic in args.topics]
    if args.include_bridge_log:
        subscriptions.append(f"{base_topic}/bridge/log")
    if args.include_bridge_event:
        subscriptions.append(f"{base_topic}/bridge/event")
    messages = capture(args, subscriptions)
    if not messages:
        print("No watched messages received.")
        return 1
    for topic, payload in messages:
        print_message(topic, payload)
    return 0


def cmd_publish(args) -> int:
    """Publish one raw payload, optionally echoing messages seen on the topic.

    With ``--json`` the payload is parsed and re-serialized before anything
    else happens, so malformed JSON is rejected up front instead of surfacing
    as a traceback after the env file has already been read.

    Returns:
        0 once the publish request has been handed to ``capture``;
        2 when ``--json`` was given and the payload is not valid JSON.
    """
    import sys  # local import: only needed for the error path below

    payload = args.payload
    if args.json:
        try:
            payload = json.dumps(json.loads(payload))
        except json.JSONDecodeError as exc:
            print(f"Invalid JSON payload: {exc}", file=sys.stderr)
            return 2

    config = mqtt_config(Path(args.env_file))
    topic = normalize_topic(config["base_topic"], args.topic)
    messages = capture(
        args,
        # Only subscribe when the caller asked to see what arrives on the topic.
        [topic] if args.echo else [],
        [{"topic": topic, "payload": payload, "retain": args.retain, "qos": args.qos}],
    )
    print(f"Published to {topic}")
    if args.echo:
        for message_topic, message_payload in messages:
            print_message(message_topic, message_payload)
    return 0


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with one sub-command per diagnostic.

    Every sub-parser stores its handler in ``func`` via ``set_defaults`` so
    that ``main`` can dispatch with ``args.func(args)``.
    """
    parser = argparse.ArgumentParser(
        description="MQTT and Zigbee2MQTT diagnostics for this repository.",
        epilog=(
            "Examples:\n"
            "  python scripts\\mqtt_z2m_diag.py health\n"
            "  python scripts\\mqtt_z2m_diag.py device-record nitori_salotto_1\n"
            "  python scripts\\mqtt_z2m_diag.py get nitori_salotto_1 --keys state_left state_center state_right\n"
            "  python scripts\\mqtt_z2m_diag.py configure nitori_salotto_1\n"
            "  python scripts\\mqtt_z2m_diag.py pair-watch nitori_salotto_1 --time 90 --duration 95\n"
            "  python scripts\\mqtt_z2m_diag.py compare nitori_salotto_1 Switch_Cucina_Neo --configure\n"
        ),
        # Raw formatter keeps the hand-written line breaks of the examples.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--env-file",
        default=str(DEFAULT_ENV_PATH),
        help="Path to the local MQTT env file. Default: .local\\mqtt-home.env",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    health = subparsers.add_parser("health", help="Check broker connectivity and Zigbee2MQTT bridge state.")
    health.add_argument("--duration", type=float, default=3.0)
    health.set_defaults(func=cmd_health)

    device_record = subparsers.add_parser(
        "device-record",
        help="Read Zigbee2MQTT bridge/devices and print selected device records.",
    )
    device_record.add_argument("devices", nargs="*", help="Friendly names or IEEE addresses.")
    device_record.add_argument("--duration", type=float, default=4.0)
    device_record.set_defaults(func=cmd_device_record)

    state = subparsers.add_parser("state", help="Read retained/live device state topics.")
    state.add_argument("devices", nargs="+", help="Device topics without the base topic prefix.")
    # The positive flag is kept for backward compatibility; because the default
    # is already True it was previously impossible to turn availability off,
    # so a matching negative flag writes False to the same destination.
    state.add_argument(
        "--include-availability",
        dest="include_availability",
        action="store_true",
        default=True,
        help="Include availability in the state read (default: enabled).",
    )
    state.add_argument(
        "--no-include-availability",
        dest="include_availability",
        action="store_false",
        help="Do not include availability in the state read.",
    )
    state.add_argument("--duration", type=float, default=4.0)
    state.set_defaults(func=cmd_state)

    get_parser = subparsers.add_parser("get", help="Send a Zigbee2MQTT device get request and wait for replies.")
    get_parser.add_argument("device", help="Device topic without the base topic prefix.")
    get_parser.add_argument("--keys", nargs="*", default=[], help="Keys to request. Default requests state.")
    get_parser.add_argument("--duration", type=float, default=6.0)
    get_parser.set_defaults(func=cmd_get)

    configure = subparsers.add_parser("configure", help="Ask Zigbee2MQTT to configure a device.")
    configure.add_argument("device", help="Device friendly name.")
    configure.add_argument("--duration", type=float, default=10.0)
    configure.set_defaults(func=cmd_configure)

    permit_join = subparsers.add_parser("permit-join", help="Open Zigbee permit-join for a short time.")
    permit_join.add_argument("--time", type=int, default=90, help="Permit-join duration in seconds.")
    permit_join.add_argument("--watch", action="store_true", help="Also watch bridge log/event topics.")
    permit_join.add_argument("--duration", type=float, default=8.0)
    permit_join.set_defaults(func=cmd_permit_join)

    pair_watch = subparsers.add_parser(
        "pair-watch",
        help="Open permit-join and watch a device for announce/interview/configure/state traffic.",
    )
    pair_watch.add_argument("device", help="Device friendly name.")
    pair_watch.add_argument("--time", type=int, default=90, help="Permit-join duration in seconds.")
    pair_watch.add_argument("--duration", type=float, default=95.0)
    pair_watch.set_defaults(func=cmd_pair_watch)

    compare = subparsers.add_parser(
        "compare",
        help="Compare two devices by bridge record and current state, with optional configure probes.",
    )
    compare.add_argument("device_a")
    compare.add_argument("device_b")
    compare.add_argument("--configure", action="store_true", help="Also issue configure requests.")
    compare.add_argument("--duration", type=float, default=6.0)
    compare.set_defaults(func=cmd_compare)

    watch = subparsers.add_parser("watch", help="Watch arbitrary MQTT topics under the configured base topic.")
    watch.add_argument("topics", nargs="+", help="Topics relative to the base topic unless already absolute.")
    watch.add_argument("--include-bridge-log", action="store_true")
    watch.add_argument("--include-bridge-event", action="store_true")
    watch.add_argument("--duration", type=float, default=10.0)
    watch.set_defaults(func=cmd_watch)

    publish = subparsers.add_parser("publish", help="Publish a raw MQTT payload, optionally echoing replies.")
    publish.add_argument("topic", help="Topic relative to the base topic unless already absolute.")
    publish.add_argument("payload", help="Payload to publish.")
    publish.add_argument("--json", action="store_true", help="Validate and normalize payload as JSON before publish.")
    publish.add_argument("--retain", action="store_true")
    publish.add_argument("--qos", type=int, default=0, choices=(0, 1, 2))
    publish.add_argument("--echo", action="store_true", help="Subscribe to the same topic and print any messages seen.")
    publish.add_argument("--duration", type=float, default=4.0)
    publish.set_defaults(func=cmd_publish)

    return parser


def main(argv=None) -> int:
    """Parse the command line and run the selected sub-command.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        The integer exit status returned by the sub-command handler.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    return args.func(args)


if __name__ == "__main__":
    raise SystemExit(main())