From 0cd0d956124f9165b872d4e9a9ca3546581d6b17 Mon Sep 17 00:00:00 2001 From: RobbStarkAustria Date: Sun, 5 Apr 2026 08:36:50 +0200 Subject: [PATCH] feat: remote commands, systemd units, process observability, broker auth split - Command intake (reboot/shutdown) on infoscreen/{uuid}/commands with ack lifecycle - MQTT_USER/MQTT_PASSWORD_BROKER split from identity vars; configure_mqtt_security() updated - infoscreen-simclient.service: Type=notify, WatchdogSec=60, Restart=on-failure - infoscreen-notify-failure@.service + script: retained MQTT alert when systemd gives up (Gap 3) - _sd_notify() watchdog keepalive in simclient main loop (Gap 1) - broker_connection block in health payload: reconnect_count, last_disconnect_at (Gap 2) - COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE canary flag with safety guard - SERVER_TEAM_ACTIONS.md: server-side integration action items - Docs: README, CHANGELOG, src/README, copilot-instructions updated - 43 tests passing --- .env.template | 35 +- .github/copilot-instructions.md | 6 +- CHANGELOG.md | 41 ++ README.md | 44 +- SERVER_TEAM_ACTIONS.md | 127 ++++ TODO.md | 88 +++ .../reboot-command-payload-schemas.json | 149 ++++ .../reboot-command-payload-schemas.md | 59 ++ ...boot-implementation-handoff-client-team.md | 169 +++++ .../reboot-implementation-handoff-share.md | 175 +++++ .../reboot-kickoff-summary.md | 54 ++ scripts/infoscreen-cmd-helper.sh | 27 + scripts/infoscreen-display.service | 2 + scripts/infoscreen-notify-failure.sh | 55 ++ scripts/infoscreen-notify-failure@.service | 19 + scripts/infoscreen-simclient.service | 55 ++ scripts/install-command-helper.sh | 24 + scripts/mock-command-helper.sh | 34 + scripts/start-simclient.sh | 35 + scripts/test-mqtt.sh | 22 +- scripts/test-power-intent.sh | 33 +- scripts/test-reboot-command.sh | 244 +++++++ src/.env.production.template | 7 + src/.env.template | 7 + src/README.md | 44 +- src/pi-setup.sh | 45 +- src/simclient.py | 636 +++++++++++++++++- tests/test_command_intake.py | 287 ++++++++ 28 files changed, 2487 insertions(+), 36 deletions(-) create mode 100644 SERVER_TEAM_ACTIONS.md create mode 100644 implementation-plans/reboot-command-payload-schemas.json create mode 100644 implementation-plans/reboot-command-payload-schemas.md create mode 100644 implementation-plans/reboot-implementation-handoff-client-team.md create mode 100644 implementation-plans/reboot-implementation-handoff-share.md create mode 100644 implementation-plans/reboot-kickoff-summary.md create mode 100755 scripts/infoscreen-cmd-helper.sh create mode 100755 scripts/infoscreen-notify-failure.sh create mode 100644 scripts/infoscreen-notify-failure@.service create mode 100644 scripts/infoscreen-simclient.service create mode 100755 scripts/install-command-helper.sh create mode 100755 scripts/mock-command-helper.sh create mode 100755 scripts/start-simclient.sh create mode 100755 scripts/test-reboot-command.sh create mode 100644 tests/test_command_intake.py diff --git a/.env.template b/.env.template index cd723a3..7de90b8 100644 --- a/.env.template +++ b/.env.template @@ -16,6 +16,17 @@ LOG_LEVEL=INFO # DEBUG | INFO | WARNING | ERROR # MQTT Broker Configuration MQTT_BROKER= # Change to your MQTT server IP MQTT_PORT=1883 +# Broker login used by simclient to connect to MQTT +MQTT_USER= +MQTT_PASSWORD_BROKER= +# Optional per-device identity credentials (legacy fallback) +MQTT_USERNAME=infoscreen-client- +MQTT_PASSWORD= +MQTT_TLS_ENABLED=0 # 1 when broker TLS is enabled for this client +# MQTT_TLS_CA_CERT=/etc/infoscreen/mqtt/ca.crt +# MQTT_TLS_CERT=/etc/infoscreen/mqtt/client.crt +# MQTT_TLS_KEY=/etc/infoscreen/mqtt/client.key +# MQTT_TLS_INSECURE=0 # only for controlled test environments # Timing Configuration (quieter intervals for productive test) HEARTBEAT_INTERVAL=60 # Heartbeat frequency in seconds @@ -47,16 +58,26 @@ CEC_POWER_OFF_WAIT=5 # Seconds to wait after power OFF command (increase # hybrid — prefer MQTT intent when present and valid; fall back to local CEC if not # mqtt — MQTT intent is authoritative; local CEC only fires as last-resort guard # See README.md "TV Power Intent — Rollout Runbook" before changing from 'local'. -POWER_CONTROL_MODE=local # local | hybrid | mqtt +POWER_CONTROL_MODE=hybrid # local | hybrid | mqtt -# Optional: MQTT authentication (if your broker requires username/password) -#MQTT_USERNAME= -#MQTT_PASSWORD= +# Reboot/Shutdown command handling +# Helper installed by ./scripts/install-command-helper.sh +COMMAND_HELPER_PATH=/usr/local/bin/infoscreen-cmd-helper.sh +# Mock mode (safe canary): uncomment next line and comment the live path above +# COMMAND_HELPER_PATH=/home/olafn/infoscreen-dev/scripts/mock-command-helper.sh +# Timeout for helper execution (seconds) +COMMAND_EXEC_TIMEOUT_SEC=15 +# Test mode: for reboot_host with mock helper, send completed without restart (0/1) +COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE=0 +# Command deduplication retention window (hours) +COMMAND_DEDUPE_TTL_HOURS=24 +# Maximum processed command IDs kept in dedupe cache +COMMAND_DEDUPE_MAX_ENTRIES=5000 + +# MQTT authentication +# Use a per-client service account. Keep this file mode 600 on the device. # Optional TLS settings (if using secure MQTT) -#MQTT_TLS_CA_CERT= -#MQTT_TLS_CERT= -#MQTT_TLS_KEY= # Notes: # - Keep actual secrets and host-specific values in a local .env file that is NOT committed. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index e38733c..679228e 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -45,6 +45,7 @@ Use specialist docs for deep operational details: - `HDMI_CEC_SETUP.md` (CEC setup/troubleshooting) - `SCREENSHOT_MQTT_FIX.md` (screenshot race-condition fixes) - `src/README.md` (developer-focused architecture/debugging) +- `SERVER_TEAM_ACTIONS.md` (server-side integration action items) ## Critical Rules @@ -60,12 +61,13 @@ Use specialist docs for deep operational details: - Root `README.md` is a landing page; do not re-expand it into a full manual. - TV power rollout guidance lives in `TV_POWER_RUNBOOK.md`. - TV power contract truth lives in `TV_POWER_INTENT_SERVER_CONTRACT_V1.md`. +- `MQTT_USER`/`MQTT_PASSWORD_BROKER` are broker login credentials; `MQTT_USERNAME`/`MQTT_PASSWORD` are legacy identity fields. Never confuse the two. ## Architecture Snapshot Two-process design: -- `src/simclient.py`: MQTT communication, discovery, group assignment, event intake, heartbeat, dashboard publish, power intent ingestion. +- `src/simclient.py`: MQTT communication, discovery, group assignment, event intake, heartbeat, dashboard publish, power intent ingestion, remote command intake. - `src/display_manager.py`: content display lifecycle, HDMI-CEC, screenshot capture, runtime process health. Runtime coordination files: @@ -105,6 +107,8 @@ Runtime coordination files: - Power intent application: `src/display_manager.py` -> `_apply_mqtt_power_intent()` - Screenshot capture logic: `src/display_manager.py` -> `_capture_screenshot()` - Dashboard payload: `src/simclient.py` -> `_build_dashboard_payload()` +- Remote command intake: `src/simclient.py` -> `on_command_message()` +- Command validation: `src/simclient.py` -> `validate_command_payload()` - File URL rewriting: `src/simclient.py` -> `resolve_file_url()` ## Documentation Policy diff --git a/CHANGELOG.md b/CHANGELOG.md index e10f8d5..1c05783 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,47 @@ ## April 2026 +### Remote Command Intake + +- Added MQTT command intake on `infoscreen/{client_id}/commands` (supports `reboot` and `shutdown`). +- Added command acknowledgement publishing to `infoscreen/{client_id}/commands/ack` and `infoscreen/{client_id}/command/ack` with states `accepted`, `rejected`, `execution_started`, `completed`, `failed`. +- Added `COMMAND_HELPER_PATH` environment variable; command execution delegated to an external shell helper so `simclient.py` requires no elevated privileges. +- Added deduplication of commands by `command_id` with configurable TTL (`COMMAND_DEDUPE_TTL_HOURS`) and max-entries cap (`COMMAND_DEDUPE_MAX_ENTRIES`). +- Added execution timeout (`COMMAND_EXEC_TIMEOUT_SEC`). +- Added `COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE` flag for canary and test environments — immediately completes a mock reboot without waiting for process restart. Safety-guarded: only activates when the helper basename is `mock-command-helper.sh`. + +### MQTT Broker Authentication Split + +- Split broker connection credentials (`MQTT_USER`, `MQTT_PASSWORD_BROKER`) from legacy per-device identity fields (`MQTT_USERNAME`, `MQTT_PASSWORD`). +- `configure_mqtt_security()` now prefers `MQTT_USER`/`MQTT_PASSWORD_BROKER` for broker login, with fallback to legacy vars if broker-specific vars are absent. + +### Systemd Service Units + +- Added `scripts/infoscreen-simclient.service` — systemd unit for `simclient.py` with `Type=notify`, `WatchdogSec=60`, `Restart=on-failure`, `StartLimitBurst=5`. +- Added `scripts/start-simclient.sh` — launcher script mirroring `start-display-manager.sh`. +- Updated `scripts/infoscreen-display.service` with `OnFailure=infoscreen-notify-failure@%n.service`. +- Updated `src/pi-setup.sh` to install and enable both units plus the failure notifier template. + +### Process Watchdog (Gap 1 — Hung Process Detection) + +- Added zero-dependency `_sd_notify()` raw socket helper in `simclient.py` (no `systemd-python` package required). +- Sends `READY=1` on main loop entry and `WATCHDOG=1` on every 5-second iteration. +- Service unit uses `Type=notify` and `WatchdogSec=60`; systemd will restart the process if it stops sending keepalives for 60 seconds. + +### OnFailure MQTT Notifier (Gap 3 — systemd Give-Up Detection) + +- Added `scripts/infoscreen-notify-failure@.service` — systemd template unit triggered by `OnFailure=`. +- Added `scripts/infoscreen-notify-failure.sh` — publishes a retained JSON payload to `infoscreen/{uuid}/service_failed` via `mosquitto_pub` so the monitoring dashboard gets an alert even when the process is fully dead. +- Payload: `{"event":"service_failed","unit":"","client_uuid":"...","failed_at":""}`. + +### Health Payload Broker Connection Block (Gap 2 — Broker vs. Process Ambiguity) + +- Added `broker_connection` block to the health payload: `broker_reachable`, `reconnect_count`, `connect_count`, `last_disconnect_at`. +- `simclient.py` now tracks `reconnect_count` and `connect_count` on every `on_connect` callback and `last_disconnect` timestamp on `on_disconnect`. +- `publish_health_message()` accepts an optional `connection_state` parameter; both heartbeat-success call sites pass the enriched state. + +### TV Power Coordination + - Added Phase 1 TV power coordination on `infoscreen/groups/{group_id}/power/intent`. - Added `POWER_CONTROL_MODE` with `local`, `hybrid`, and `mqtt` behavior. - Added `src/power_intent_state.json` and `src/power_state.json` for power IPC and telemetry. diff --git a/README.md b/README.md index e112c41..f9df1a1 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,11 @@ LOG_LEVEL=INFO MQTT_BROKER=192.168.1.100 MQTT_PORT=1883 +MQTT_USER= +MQTT_PASSWORD_BROKER= +MQTT_USERNAME=infoscreen-client- +MQTT_PASSWORD= +MQTT_TLS_ENABLED=0 HEARTBEAT_INTERVAL=60 SCREENSHOT_INTERVAL=180 @@ -68,8 +73,22 @@ CEC_POWER_ON_WAIT=5 CEC_POWER_OFF_WAIT=5 POWER_CONTROL_MODE=local + +COMMAND_HELPER_PATH=/usr/local/bin/infoscreen-cmd-helper.sh +COMMAND_EXEC_TIMEOUT_SEC=15 +COMMAND_DEDUPE_TTL_HOURS=24 +COMMAND_DEDUPE_MAX_ENTRIES=5000 +COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE=0 ``` +MQTT auth/TLS notes: + +- `MQTT_USER` / `MQTT_PASSWORD_BROKER` are the broker credentials used at connection time. +- `MQTT_USERNAME` / `MQTT_PASSWORD` are legacy per-device identity fields kept for fallback and identity purposes. +- Store real broker credentials only in the local [/.env](.env), which is gitignored. +- When TLS is enabled, also set `MQTT_TLS_CA_CERT`, and if client certificates are used, `MQTT_TLS_CERT` and `MQTT_TLS_KEY`. +- Keep the local [/.env](.env) readable only by the service user and admins, for example mode `600`. + Mode summary: - `POWER_CONTROL_MODE=local`: local event-time CEC only. @@ -78,21 +97,23 @@ Mode summary: ### 3. Start Services +The preferred method on deployed devices is systemd: + ```bash -cd ~/infoscreen-dev/src -python3 simclient.py +sudo systemctl start infoscreen-simclient infoscreen-display +sudo systemctl status infoscreen-simclient infoscreen-display +sudo journalctl -u infoscreen-simclient -u infoscreen-display -f ``` -In a second terminal: +For first-time setup, run `src/pi-setup.sh` to install and enable the units. See [src/README.md](src/README.md) for the systemd setup steps. + +For local development without systemd: ```bash -cd ~/infoscreen-dev/src -python3 display_manager.py -``` +# Terminal 1 +./scripts/start-simclient.sh -Or use the helper script: - -```bash +# Terminal 2 ./scripts/start-display-manager.sh ``` @@ -154,6 +175,7 @@ Use the helper scripts in `scripts/` for focused tests: - `./scripts/test-impressive.sh`: single-play presentation. - `./scripts/test-impressive-loop.sh`: looping presentation. - `./scripts/test-mqtt.sh`: MQTT broker connectivity. +- `./scripts/test-reboot-command.sh`: end-to-end reboot/shutdown command lifecycle canary (`accepted -> execution_started -> completed/failed`). - `./scripts/test-screenshot.sh`: screenshot capture. - `./scripts/test-hdmi-cec.sh`: HDMI-CEC diagnostics and runtime state inspection. - `./scripts/test-power-intent.sh`: MQTT power intent publishing, rejection tests, and telemetry checks. @@ -173,7 +195,8 @@ Quick checks: - Follow logs: `tail -f logs/display_manager.log src/simclient.log` - Inspect screenshots: `ls -lh src/screenshots/` - Inspect power state: `cat src/power_intent_state.json` and `cat src/power_state.json` -- Restart both services: `./scripts/restart-all.sh` +- Restart services (systemd): `sudo systemctl restart infoscreen-simclient infoscreen-display` +- Restart services (dev): `./scripts/restart-all.sh` ## Deployment @@ -213,6 +236,7 @@ If running directly on the host, ensure: - [src/IMPLEMENTATION_SUMMARY.md](src/IMPLEMENTATION_SUMMARY.md) - [TV_POWER_COORDINATION_TASKLIST.md](TV_POWER_COORDINATION_TASKLIST.md) - [TV_POWER_HANDOFF_SERVER.md](TV_POWER_HANDOFF_SERVER.md) +- [SERVER_TEAM_ACTIONS.md](SERVER_TEAM_ACTIONS.md) ## Contributing diff --git a/SERVER_TEAM_ACTIONS.md b/SERVER_TEAM_ACTIONS.md new file mode 100644 index 0000000..a28faa8 --- /dev/null +++ b/SERVER_TEAM_ACTIONS.md @@ -0,0 +1,127 @@ +# Server Team Action Items — Infoscreen Client + +This document lists everything the server/infrastructure/frontend team must implement to complete the client integration. The client-side code is production-ready for all items listed here. + +--- + +## 1. MQTT Broker Hardening (prerequisite for everything else) + +- Disable anonymous access on the broker. +- Create one broker account **per client device**: + - Username convention: `infoscreen-client-` (e.g. `infoscreen-client-9b8d1856`) + - Provision the password to the device `.env` as `MQTT_PASSWORD_BROKER=` +- Create a **server/publisher account** (e.g. `infoscreen-server`) for all server-side publishes. +- Enforce ACLs: + +| Topic | Publisher | +|---|---| +| `infoscreen/{uuid}/commands` | server only | +| `infoscreen/{uuid}/command` (alias) | server only | +| `infoscreen/{uuid}/group_id` | server only | +| `infoscreen/events/{group_id}` | server only | +| `infoscreen/groups/+/power/intent` | server only | +| `infoscreen/{uuid}/commands/ack` | client only | +| `infoscreen/{uuid}/command/ack` | client only | +| `infoscreen/{uuid}/heartbeat` | client only | +| `infoscreen/{uuid}/health` | client only | +| `infoscreen/{uuid}/logs/#` | client only | +| `infoscreen/{uuid}/service_failed` | client only | + +--- + +## 2. Reboot / Shutdown Command — Ack Lifecycle + +Client publishes ack status updates to two topics per command (canonical + transitional alias): +- `infoscreen/{uuid}/commands/ack` +- `infoscreen/{uuid}/command/ack` + +**Ack payload schema (v1, frozen):** +```json +{ + "command_id": "07aab032-53c2-45ef-a5a3-6aa58e9d9fae", + "status": "accepted | execution_started | completed | failed", + "error_code": null, + "error_message": null +} +``` + +**Status lifecycle:** + +| Status | When | Notes | +|---|---|---| +| `accepted` | Command received and validated | Immediate | +| `execution_started` | Helper invoked | Immediate after accepted | +| `completed` | Execution confirmed | For `reboot_host`: arrives after reconnect (10–90 s after `execution_started`) | +| `failed` | Helper returned error | `error_code` and `error_message` will be set | + +**Server must:** +- Track `command_id` through the full lifecycle and update status in DB/UI. +- Surface `failed` + `error_code` to the operator UI. +- Expect `reboot_host` `completed` to arrive after a reconnect delay — do not treat the gap as a timeout. +- Use `expires_at` from the original command to determine when to abandon waiting. + +--- + +## 3. Health Dashboard — Broker Connection Fields (Gap 2) + +Every `infoscreen/{uuid}/health` payload now includes a `broker_connection` block: + +```json +{ + "timestamp": "2026-04-05T08:00:00.000000+00:00", + "expected_state": { "event_id": 42 }, + "actual_state": { + "process": "display_manager", + "pid": 1234, + "status": "running" + }, + "broker_connection": { + "broker_reachable": true, + "reconnect_count": 2, + "last_disconnect_at": "2026-04-04T10:30:00Z" + } +} +``` + +**Server must:** +- Display `reconnect_count` and `last_disconnect_at` per device in the health dashboard. +- Implement alerting heuristic: + - **All** clients go silent simultaneously → likely broker outage, not device crash. + - **Single** client goes silent → device crash, network failure, or process hang. + +--- + +## 4. Service-Failed MQTT Notification (Gap 3) + +When systemd gives up restarting a service after repeated crashes (`StartLimitBurst` exceeded), the client automatically publishes a **retained** message: + +**Topic:** `infoscreen/{uuid}/service_failed` + +**Payload:** +```json +{ + "event": "service_failed", + "unit": "infoscreen-simclient.service", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "failed_at": "2026-04-05T08:00:00Z" +} +``` + +**Server must:** +- Subscribe to `infoscreen/+/service_failed` on startup (retained — message survives broker restart). +- Alert the operator immediately when this topic receives a payload. +- **Clear the retained message** once the device is acknowledged or recovered: + ``` + mosquitto_pub -t "infoscreen/{uuid}/service_failed" -n --retain + ``` + +--- + +## 5. No Server Action Required + +These items are fully implemented client-side and require no server changes: + +- systemd watchdog (`WatchdogSec=60`) — hangs detected and process restarted automatically. +- Command deduplication — `command_id` deduplicated with 24-hour TTL. +- Ack retry backoff — client retries ack publish on broker disconnect until `expires_at`. +- Mock helper / test mode (`COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE`) — development only. diff --git a/TODO.md b/TODO.md index 02730b6..d1265dc 100644 --- a/TODO.md +++ b/TODO.md @@ -25,10 +25,98 @@ This file tracks higher-level todos and design notes for the infoscreen client. - `set_volume()` issues appropriate CEC commands and returns success/failure. - Document any platform limitations (some TVs don't support absolute volume via CEC). +## Systemd crash recovery (server team recommendation) + +Reliable restart-on-crash for both processes must be handled by **systemd**, not by in-process watchdogs or ad-hoc shell scripts. + +### What needs to be done + +- `display_manager`: already has `scripts/infoscreen-display.service` with `Restart=on-failure` / `RestartSec=10`. + - Review `RestartSec` — may want a short backoff (e.g. 5–15 s) and `StartLimitIntervalSec` + `StartLimitBurst` to prevent thrash loops. +- `simclient`: **no service unit exists yet**. + - Create `scripts/infoscreen-simclient.service` modelled on the display service. + - Use `Restart=on-failure` and `RestartSec=10`. + - Wire `EnvironmentFile=/home/olafn/infoscreen-dev/.env` so the unit picks up `.env` variables automatically. + - Set `After=network-online.target` so MQTT connection is not attempted before the network is ready. +- Both units should be installed and enabled via `src/pi-setup.sh` (`systemctl enable --now`). +- After enabling, verify crash recovery with `kill -9 ` and confirm systemd restarts the process within `RestartSec`. + +### Acceptance criteria + +- Both `simclient` and `display_manager` restart automatically within 15 s of any non-intentional exit. +- `systemctl status` shows `active (running)` after a crash-induced restart. +- `journalctl -u infoscreen-simclient` captures all process output (stdout + stderr). +- `pi-setup.sh` idempotently installs and enables both units. + +### Notes + +- Use `Restart=on-failure` — restarts on crashes and signals but not on clean `systemctl stop`, preserving operator control during deployments. +- The reboot/shutdown command flow publishes `execution_started` and then exits intentionally; systemd will restart simclient, and the recovery logic in the heartbeat loop will emit `completed` on reconnect. This is the intended lifecycle. + +## Process health observability gaps + +Two scenarios are currently undetected or ambiguous from the server/frontend perspective. + +### Gap 1: Hung / deadlocked process ✅ implemented + +**Solution implemented:** Zero-dependency `_sd_notify()` helper writes directly to `NOTIFY_SOCKET` (raw Unix socket, no extra package). `READY=1` is sent when the heartbeat loop starts; `WATCHDOG=1` is sent every 5 s in the main loop iteration. The service unit uses `Type=notify` + `WatchdogSec=60` — if the main loop freezes for 60 s, systemd kills and restarts the process automatically. + +**To apply on device:** +```bash +sudo cp ~/infoscreen-dev/scripts/infoscreen-simclient.service /etc/systemd/system/ +sudo systemctl daemon-reload +sudo systemctl restart infoscreen-simclient +``` + +### Gap 2: MQTT broker unreachable vs. simclient dead ✅ implemented (client side) + +**Solution implemented:** `connection_state` dict expanded with `reconnect_count` and `connect_count`. `publish_health_message()` now accepts `connection_state` and appends a `broker_connection` block to every health payload: + +```json +"broker_connection": { + "broker_reachable": true, + "reconnect_count": 2, + "last_disconnect_at": "2026-04-04T10:00:00Z" +} +``` + +`broker_reachable` = `true` when MQTT is connected at publish time. +`reconnect_count` increments on every reconnection (first connect does not count). +`last_disconnect_at` is the UTC timestamp of the most recent disconnect. + +**Server-side action still needed:** +- Display `reconnect_count` and `last_disconnect_at` in the frontend health dashboard. +- Alert heuristic: if **all** clients go silent simultaneously → likely broker issue; if only one → likely device issue. + +### Gap 3: systemd gives up (StartLimitBurst exceeded) ✅ implemented + +**Solution implemented:** `scripts/infoscreen-notify-failure@.service` (template unit) + `scripts/infoscreen-notify-failure.sh`. Both main units have `OnFailure=infoscreen-notify-failure@%n.service`. When systemd marks a service `failed`, the notifier runs once, reads broker credentials from `.env`, reads `client_uuid.txt`, and publishes a retained JSON payload to `infoscreen/{uuid}/service_failed` via `mosquitto_pub`. + +**To apply on device:** +```bash +sudo cp ~/infoscreen-dev/scripts/infoscreen-notify-failure@.service /etc/systemd/system/ +sudo cp ~/infoscreen-dev/scripts/infoscreen-simclient.service /etc/systemd/system/ +sudo cp ~/infoscreen-dev/scripts/infoscreen-display.service /etc/systemd/system/ +sudo systemctl daemon-reload +sudo systemctl restart infoscreen-simclient infoscreen-display +``` + +**Topic:** `infoscreen/{client_uuid}/service_failed` (retained) +**Payload:** `{"event":"service_failed","unit":"infoscreen-simclient.service","client_uuid":"...","failed_at":"2026-..."}` + ## Next-high-level items - Add environment-controlled libVLC hw-accel toggle (`VLC_HW_ACCEL=1|0`) to `display_manager.py` so software decode can be forced when necessary. - Add automated tests for video start/stop lifecycle (mock python-vlc) to ensure resources are released on event end. +- Add allowlist validation for `website` / `webuntis` event URLs + - Goal: restrict browser-based events to approved hosts and schemes even if an authenticated publisher sends an unsafe URL. + - Ideas / approaches: + - Add env-configurable allowlists for general website hosts and WebUntis hosts. + - Allow only `https` by default and reject `file:`, `data:`, `javascript:`, loopback, and private-address URLs unless explicitly allowed. + - Enforce the same validation on both server-side payload generation and client-side execution in `display_manager.py`. + - Acceptance criteria: + - Unsafe or unapproved URLs are rejected before Chromium is launched. + - WebUntis and approved website events still work with explicit allowlist configuration. ## Notes diff --git a/implementation-plans/reboot-command-payload-schemas.json b/implementation-plans/reboot-command-payload-schemas.json new file mode 100644 index 0000000..5f96392 --- /dev/null +++ b/implementation-plans/reboot-command-payload-schemas.json @@ -0,0 +1,149 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://infoscreen.local/schemas/reboot-command-payload-schemas.json", + "title": "Infoscreen Reboot Command Payload Schemas", + "description": "Frozen v1 schemas for per-client command and command acknowledgement payloads.", + "$defs": { + "commandPayloadV1": { + "type": "object", + "additionalProperties": false, + "required": [ + "schema_version", + "command_id", + "client_uuid", + "action", + "issued_at", + "expires_at", + "requested_by", + "reason" + ], + "properties": { + "schema_version": { + "type": "string", + "const": "1.0" + }, + "command_id": { + "type": "string", + "format": "uuid" + }, + "client_uuid": { + "type": "string", + "format": "uuid" + }, + "action": { + "type": "string", + "enum": [ + "reboot_host", + "shutdown_host" + ] + }, + "issued_at": { + "type": "string", + "format": "date-time" + }, + "expires_at": { + "type": "string", + "format": "date-time" + }, + "requested_by": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "reason": { + "type": [ + "string", + "null" + ], + "maxLength": 2000 + } + } + }, + "commandAckPayloadV1": { + "type": "object", + "additionalProperties": false, + "required": [ + "command_id", + "status", + "error_code", + "error_message" + ], + "properties": { + "command_id": { + "type": "string", + "format": "uuid" + }, + "status": { + "type": "string", + "enum": [ + "accepted", + "execution_started", + "completed", + "failed" + ] + }, + "error_code": { + "type": [ + "string", + "null" + ], + "maxLength": 128 + }, + "error_message": { + "type": [ + "string", + "null" + ], + "maxLength": 4000 + } + }, + "allOf": [ + { + "if": { + "properties": { + "status": { + "const": "failed" + } + } + }, + "then": { + "properties": { + "error_code": { + "type": "string", + "minLength": 1 + }, + "error_message": { + "type": "string", + "minLength": 1 + } + } + } + } + ] + } + }, + "examples": [ + { + "commandPayloadV1": { + "schema_version": "1.0", + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "action": "reboot_host", + "issued_at": "2026-04-03T12:48:10Z", + "expires_at": "2026-04-03T12:52:10Z", + "requested_by": 1, + "reason": "operator_request" + } + }, + { + "commandAckPayloadV1": { + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "status": "execution_started", + "error_code": null, + "error_message": null + } + } + ] +} diff --git a/implementation-plans/reboot-command-payload-schemas.md b/implementation-plans/reboot-command-payload-schemas.md new file mode 100644 index 0000000..7612eed --- /dev/null +++ b/implementation-plans/reboot-command-payload-schemas.md @@ -0,0 +1,59 @@ +## Reboot Command Payload Schema Snippets + +This file provides copy-ready validation snippets for client and integration test helpers. + +### Canonical Topics (v1) +1. Command topic: infoscreen/{client_uuid}/commands +2. Ack topic: infoscreen/{client_uuid}/commands/ack + +### Transitional Compatibility Topics +1. Command topic alias: infoscreen/{client_uuid}/command +2. Ack topic alias: infoscreen/{client_uuid}/command/ack + +### Canonical Action Values +1. reboot_host +2. shutdown_host + +### Ack Status Values +1. accepted +2. execution_started +3. completed +4. failed + +### JSON Schema Source +Use this file for machine validation: +1. implementation-plans/reboot-command-payload-schemas.json + +### Minimal Command Schema Snippet +```json +{ + "type": "object", + "additionalProperties": false, + "required": ["schema_version", "command_id", "client_uuid", "action", "issued_at", "expires_at", "requested_by", "reason"], + "properties": { + "schema_version": { "const": "1.0" }, + "command_id": { "type": "string", "format": "uuid" }, + "client_uuid": { "type": "string", "format": "uuid" }, + "action": { "enum": ["reboot_host", "shutdown_host"] }, + "issued_at": { "type": "string", "format": "date-time" }, + "expires_at": { "type": "string", "format": "date-time" }, + "requested_by": { "type": ["integer", "null"] }, + "reason": { "type": ["string", "null"] } + } +} +``` + +### Minimal Ack Schema Snippet +```json +{ + "type": "object", + "additionalProperties": false, + "required": ["command_id", "status", "error_code", "error_message"], + "properties": { + "command_id": { "type": "string", "format": "uuid" }, + "status": { "enum": ["accepted", "execution_started", "completed", "failed"] }, + "error_code": { "type": ["string", "null"] }, + "error_message": { "type": ["string", "null"] } + } +} +``` diff --git a/implementation-plans/reboot-implementation-handoff-client-team.md b/implementation-plans/reboot-implementation-handoff-client-team.md new file mode 100644 index 0000000..23b7982 --- /dev/null +++ b/implementation-plans/reboot-implementation-handoff-client-team.md @@ -0,0 +1,169 @@ +## Client Team Implementation Spec (Raspberry Pi 5) + +### Mission +Implement client-side command handling for reliable restart and shutdown with strict validation, idempotency, acknowledgements, and reboot recovery continuity. + +### Ownership Boundaries +1. Client team owns command intake, execution, acknowledgement emission, and post-reboot continuity. +2. Platform team owns command issuance, lifecycle aggregation, and server-side escalation logic. +3. Client implementation must not assume managed PoE availability. + +### Required Client Behaviors + +### Frozen MQTT Topics and Schemas (v1) +1. Canonical command topic: infoscreen/{client_uuid}/commands. +2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack. +3. Transitional compatibility topics during migration: +- infoscreen/{client_uuid}/command +- infoscreen/{client_uuid}/command/ack +4. QoS policy: command QoS 1, ack QoS 1 recommended. +5. Retain policy: commands and acks are non-retained. +6. Client migration behavior: subscribe to both command topics and publish to both ack topics during migration. + +Frozen command payload schema: + +```json +{ + "schema_version": "1.0", + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "action": "reboot_host", + "issued_at": "2026-04-03T12:48:10Z", + "expires_at": "2026-04-03T12:52:10Z", + "requested_by": 1, + "reason": "operator_request" +} +``` + +Frozen ack payload schema: + +```json +{ + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "status": "execution_started", + "error_code": null, + "error_message": null +} +``` + +Allowed ack status values: +1. accepted +2. execution_started +3. completed +4. failed + +Frozen command action values for v1: +1. reboot_host +2. shutdown_host + +Reserved but not emitted by server in v1: +1. restart_service + +### Client Decision Defaults (v1) +1. Privileged helper invocation: sudoers + local helper script (`sudo /usr/local/bin/infoscreen-cmd-helper.sh`). +2. Dedupe retention: keep processed command IDs for 24 hours and cap store size to 5000 newest entries. +3. Ack retry schedule while broker unavailable: 0.5s, 1s, 2s, 4s, then 5s cap until expires_at. +4. Boot-loop handling: server remains authority for safety lockout; client enforces idempotency by command_id and reports local execution outcomes. + +### MQTT Auth Hardening (Current Priority) +1. Client must support authenticated MQTT connections for both command and event intake. +2. Client must remain compatible with broker ACLs that restrict publish/subscribe rights per topic. +3. Client should support TLS broker connections from environment configuration when certificates are provided. +4. URL/domain allowlisting for web and webuntis events is explicitly deferred and tracked separately in TODO.md. +5. Client credentials are loaded from the local [/.env](.env), not from tracked docs or templates. + +Server-side prerequisites for this client work: +1. Broker credentials must be provisioned for clients. +2. Broker ACLs must allow each client to subscribe only to its own command topics and assigned event topics. +3. Broker ACLs must allow each client to publish only its own ack, heartbeat, health, dashboard, and telemetry topics. +4. Server-side publishers must move to authenticated broker access before production rollout. + +Validation snippets for helper scripts: +1. Human-readable snippets: implementation-plans/reboot-command-payload-schemas.md +2. Machine-validated JSON Schema: implementation-plans/reboot-command-payload-schemas.json + +### 1. Command Intake +1. Subscribe to canonical and transitional command topics with QoS 1. +2. Parse required fields exactly: schema_version, command_id, client_uuid, action, issued_at, expires_at, requested_by, reason. +3. Reject invalid payloads with failed acknowledgement including error_code and diagnostic message. +4. Reject stale commands when current time exceeds expires_at. +5. Reject already-processed command_id values without re-execution. + +### 2. Idempotency And Persistence +1. Persist processed command_id and execution result on local storage. +2. Persistence must survive service restart and full OS reboot. +3. On restart, reload dedupe cache before processing newly delivered commands. + +### 3. Acknowledgement Contract Behavior +1. Emit accepted immediately after successful validation and dedupe pass. +2. Emit execution_started immediately before invoking the command action. +3. Emit completed only when local success condition is confirmed. +4. Emit failed with structured error_code on validation or execution failure. +5. If MQTT is temporarily unavailable, retry ack publish with bounded backoff until command expiry. +6. Ack payload fields are strict: command_id, status, error_code, error_message (no additional fields). +7. For status failed, error_code and error_message must be non-null, non-empty strings. + +### 4. Execution Security Model +1. Execute via systemd-managed privileged helper. +2. Allow only whitelisted operations: +- reboot_host +- shutdown_host +3. Do not execute restart_service in v1. +4. Disallow arbitrary shell commands and untrusted arguments. +5. Enforce per-command execution timeout and terminate hung child processes. + +### 5. Reboot Recovery Continuity +1. For reboot_host action: +- send execution_started +- trigger reboot promptly +2. During startup: +- emit heartbeat early +- emit process-health once service is ready +3. Keep last command execution state available after reboot for reconciliation. + +### 6. Time And Timeout Semantics +1. Use monotonic timers for local elapsed-time checks. +2. Use UTC wall-clock only for protocol timestamps and expiry comparisons. +3. Target reconnect baseline on Pi 5 USB-SATA SSD: 90 seconds. +4. Accept cold-boot and USB enumeration ceiling up to 150 seconds. + +### 7. Capability Reporting +1. Report recovery capability class: +- software_only +- managed_poe_available +- manual_only +2. Report watchdog enabled status. +3. Report boot-source metadata for diagnostics. + +### 8. Error Codes Minimum Set +1. invalid_schema +2. missing_field +3. stale_command +4. duplicate_command +5. permission_denied_local +6. execution_timeout +7. execution_failed +8. broker_unavailable +9. internal_error + +### Acceptance Tests (Client Team) +1. Invalid schema payload is rejected and failed ack emitted. +2. Expired command is rejected and not executed. +3. Duplicate command_id is not executed twice. +4. reboot_host emits execution_started and reconnects with heartbeat in expected window. +5. shutdown_host action is accepted and invokes local privileged helper without accepting non-whitelisted actions. +6. MQTT outage during ack path retries correctly without duplicate execution. +7. Client idempotency cooperates with server-side lockout semantics (no local reboot-rate policy). +8. Client connects successfully to an authenticated broker and still receives commands and event topics permitted by ACLs. + +### Delivery Artifacts +1. Client protocol conformance checklist. +2. Test evidence for all acceptance tests. +3. Runtime logs showing full lifecycle for one shutdown and one reboot scenario. +4. Known limitations list per image version. + +### Definition Of Done +1. All acceptance tests pass on Pi 5 USB-SATA SSD test devices. +2. No duplicate execution observed under reconnect and retained-delivery edge cases. +3. Acknowledgement sequence is complete and machine-parseable for server correlation. +4. Reboot recovery continuity works without managed PoE dependencies. diff --git a/implementation-plans/reboot-implementation-handoff-share.md b/implementation-plans/reboot-implementation-handoff-share.md new file mode 100644 index 0000000..4793348 --- /dev/null +++ b/implementation-plans/reboot-implementation-handoff-share.md @@ -0,0 +1,175 @@ +## Remote Reboot Reliability Handoff (Share Document) + +### Purpose +This document defines the agreed implementation scope for reliable remote reboot and shutdown of Raspberry Pi 5 clients, with monitoring-first visibility and safe escalation paths. + +### Scope +1. In scope: restart and shutdown command reliability. +2. In scope: full lifecycle monitoring and audit visibility. +3. In scope: capability-tier recovery model with optional managed PoE escalation. +4. Out of scope: broader maintenance module in client-management for this cycle. +5. Out of scope: mandatory dependency on customer-managed power switching. + +### Agreed Operating Model +1. Command delivery is asynchronous and lifecycle-tracked, not fire-and-forget. +2. Commands use idempotent command_id semantics with stale-command rejection by expires_at. +3. Monitoring is authoritative for operational state and escalation decisions. +4. Recovery must function even when no managed power switching is available. + +### Frozen Contract v1 (Effective Immediately) +1. Canonical command topic: infoscreen/{client_uuid}/commands. +2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack. +3. Transitional compatibility topics accepted during migration: +- infoscreen/{client_uuid}/command +- infoscreen/{client_uuid}/command/ack +4. QoS policy: command QoS 1, ack QoS 1 recommended. +5. Retain policy: commands and acks are non-retained. + +Command payload schema (frozen): + +```json +{ + "schema_version": "1.0", + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "action": "reboot_host", + "issued_at": "2026-04-03T12:48:10Z", + "expires_at": "2026-04-03T12:52:10Z", + "requested_by": 1, + "reason": "operator_request" +} +``` + +Ack payload schema (frozen): + +```json +{ + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "status": "execution_started", + "error_code": null, + "error_message": null +} +``` + +Allowed ack status values: +1. accepted +2. execution_started +3. completed +4. failed + +Frozen command action values: +1. reboot_host +2. shutdown_host + +API endpoint mapping: +1. POST /api/clients/{uuid}/restart -> action reboot_host +2. POST /api/clients/{uuid}/shutdown -> action shutdown_host + +Validation snippets: +1. Human-readable snippets: implementation-plans/reboot-command-payload-schemas.md +2. Machine-validated JSON Schema: implementation-plans/reboot-command-payload-schemas.json + +### Command Lifecycle States +1. queued +2. publish_in_progress +3. published +4. ack_received +5. execution_started +6. awaiting_reconnect +7. recovered +8. completed +9. failed +10. expired +11. timed_out +12. canceled +13. blocked_safety +14. manual_intervention_required + +### Timeout Defaults (Pi 5, USB-SATA SSD baseline) +1. queued to publish_in_progress: immediate, timeout 5 seconds. +2. publish_in_progress to published: timeout 8 seconds. +3. published to ack_received: timeout 20 seconds. +4. ack_received to execution_started: 15 seconds for service restart, 25 seconds for host reboot. +5. execution_started to awaiting_reconnect: timeout 10 seconds. +6. awaiting_reconnect to recovered: baseline 90 seconds after validation, cold-boot ceiling 150 seconds. +7. recovered to completed: 15 to 20 seconds based on fleet stability. +8. command expires_at default: 240 seconds, bounded 180 to 360 seconds. + +### Recovery Tiers +1. Tier 0 baseline, always required: watchdog, systemd auto-restart, lifecycle tracking, manual intervention fallback. +2. Tier 1 optional: managed PoE per-port power-cycle escalation where customer infrastructure supports it. +3. Tier 2 no remote power control: direct manual intervention workflow. + +### Governance And Safety +1. Role access: admin and superadmin. +2. Bulk actions require reason capture. +3. Safety lockout: maximum 3 reboot commands per client in 15 minutes. +4. Escalation cooldown: 60 seconds before automatic move to manual_intervention_required. + +### MQTT Auth Hardening (Phase 1, Required Before Broad Rollout) +1. Intranet-only deployment is not sufficient protection for privileged MQTT actions by itself. +2. Phase 1 hardening scope is broker authentication, authorization, and network restriction; payload URL allowlisting is deferred to a later client/server feature. +3. MQTT broker must disable anonymous publish/subscribe access in production. +4. MQTT broker must require authenticated identities for server-side publishers and client devices. +5. MQTT broker must enforce ACLs so that: +- only server-side services can publish to `infoscreen/{client_uuid}/commands` +- only server-side services can publish scheduler event topics +- each client can subscribe only to its own command topics and assigned event topics +- each client can publish only its own ack, heartbeat, health, dashboard, and telemetry topics +6. Broker port exposure must be restricted to the management network and approved hosts only. +7. TLS support is strongly recommended in this phase and should be enabled when operationally feasible. + +### Server Team Actions For Auth Hardening +1. Provision broker credentials for command/event publishers and for client devices. +2. Configure Mosquitto or equivalent broker ACLs for per-topic publish and subscribe restrictions. +3. Disable anonymous access on production brokers. +4. Restrict broker network exposure with firewall rules, VLAN policy, or equivalent network controls. +5. Update server/frontend deployment to publish MQTT with authenticated credentials. +6. Validate that server-side event publishing and reboot/shutdown command publishing still work under the new ACL policy. +7. Coordinate credential distribution and rotation with the client deployment process. + +### Credential Management Guidance +1. Real MQTT passwords must not be stored in tracked documentation or committed templates. +2. Each client device should receive a unique broker username and password, stored only in its local [/.env](.env). +3. Server-side publisher credentials should be stored in the server team's secret-management path, not in source control. +4. Recommended naming convention for client broker users: `infoscreen-client-`. +5. Client passwords should be random, at least 20 characters, and rotated through deployment tooling or broker administration procedures. +6. The server/infrastructure team owns broker-side user creation, ACL assignment, rotation, and revocation. +7. The client team owns loading credentials from local env files and validating connection behavior against the secured broker. + +### Client Team Actions For Auth Hardening +1. Add MQTT username/password support in the client connection setup. +2. Add client-side TLS configuration support from environment when certificates are provided. +3. Update local test helpers to support authenticated MQTT publishing and subscription. +4. Validate command and event intake against the authenticated broker configuration before canary rollout. + +### Ready For Server/Frontend Team (Auth Phase) +1. Client implementation is ready to connect with MQTT auth from local `.env` (`MQTT_USERNAME`, `MQTT_PASSWORD`, optional TLS settings). +2. Client command/event intake and client ack/telemetry publishing run over the authenticated MQTT session. +3. Server/frontend team must now complete broker-side enforcement and publisher migration. + +Server/frontend done criteria: +1. Anonymous broker access is disabled in production. +2. Server-side publishers use authenticated broker credentials. +3. ACLs are active and validated for command, event, and client telemetry topics. +4. At least one canary client proves end-to-end flow under ACLs: +- server publishes command/event with authenticated publisher +- client receives payload +- client sends ack/telemetry successfully +5. Revocation test passes: disabling one client credential blocks only that client without impacting others. + +Operational note: +1. Client-side auth support is necessary but not sufficient by itself; broker ACL/auth enforcement is the security control that must be enabled by the server/infrastructure team. + +### Rollout Plan +1. Contract freeze and sign-off. +2. Platform and client implementation against frozen schemas. +3. One-group canary. +4. Rollback if failed plus timed_out exceeds 5 percent. +5. Expand only after 7 days below intervention threshold. + +### Success Criteria +1. Deterministic command lifecycle visibility from enqueue to completion. +2. No duplicate execution under reconnect or delayed-delivery conditions. +3. Stable Pi 5 SSD reconnect behavior within defined baseline. +4. Clear and actionable manual intervention states when automatic recovery is exhausted. diff --git a/implementation-plans/reboot-kickoff-summary.md b/implementation-plans/reboot-kickoff-summary.md new file mode 100644 index 0000000..c739640 --- /dev/null +++ b/implementation-plans/reboot-kickoff-summary.md @@ -0,0 +1,54 @@ +## Reboot Reliability Kickoff Summary + +### Objective +Ship a reliable, observable restart and shutdown workflow for Raspberry Pi 5 clients, with safe escalation and clear operator outcomes. + +### What Is Included +1. Asynchronous command lifecycle with idempotent command_id handling. +2. Monitoring-first state visibility from queued to terminal outcomes. +3. Client acknowledgements for accepted, execution_started, completed, and failed. +4. Pi 5 USB-SATA SSD timeout baseline and tuning rules. +5. Capability-tier recovery with optional managed PoE escalation. + +### What Is Not Included +1. Full maintenance module in client-management. +2. Required managed power-switch integration. +3. Production Wake-on-LAN rollout. + +### Team Split +1. Platform team: API command lifecycle, safety controls, listener ack ingestion. +2. Web team: lifecycle-aware UX and command status display. +3. Client team: strict validation, dedupe, ack sequence, secure execution helper, reboot continuity. + +### Ownership Matrix +| Team | Primary Plan File | Main Deliverables | +| --- | --- | --- | +| Platform team | implementation-plans/reboot-implementation-handoff-share.md | Command lifecycle backend, policy enforcement, listener ack mapping, safety lockout and escalation | +| Web team | implementation-plans/reboot-implementation-handoff-share.md | Lifecycle UI states, bulk safety UX, capability visibility, command status polling | +| Client team | implementation-plans/reboot-implementation-handoff-client-team.md | Command validation, dedupe persistence, ack sequence, secure execution helper, reboot continuity | +| Project coordination | implementation-plans/reboot-kickoff-summary.md | Phase sequencing, canary gates, rollback thresholds, cross-team sign-off tracking | + +### Baseline Operational Defaults +1. Safety lockout: 3 reboot commands per client in rolling 15 minutes. +2. Escalation cooldown: 60 seconds. +3. Reconnect target on Pi 5 SSD: 90 seconds baseline, 150 seconds cold-boot ceiling. +4. Rollback canary trigger: failed plus timed_out above 5 percent. + +### Frozen Contract Snapshot +1. Canonical command topic: infoscreen/{client_uuid}/commands. +2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack. +3. Transitional compatibility topics during migration: +- infoscreen/{client_uuid}/command +- infoscreen/{client_uuid}/command/ack +4. Command schema version: 1.0. +5. Allowed command actions: reboot_host, shutdown_host. +6. Allowed ack status values: accepted, execution_started, completed, failed. +7. Validation snippets: +- implementation-plans/reboot-command-payload-schemas.md +- implementation-plans/reboot-command-payload-schemas.json + +### Immediate Next Steps +1. Continue implementation in parallel by team against frozen contract. +2. Client team validates dedupe and expiry handling on canonical topics. +3. Platform team verifies ack-state transitions for accepted, execution_started, completed, failed. +4. Execute one-group canary and validate timing plus failure drills. diff --git a/scripts/infoscreen-cmd-helper.sh b/scripts/infoscreen-cmd-helper.sh new file mode 100755 index 0000000..e1edfde --- /dev/null +++ b/scripts/infoscreen-cmd-helper.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Privileged command helper for remote reboot/shutdown actions. +# Intended installation path: /usr/local/bin/infoscreen-cmd-helper.sh +# Suggested sudoers entry: +# infoscreen ALL=(ALL) NOPASSWD: /usr/local/bin/infoscreen-cmd-helper.sh + +if [[ $# -ne 1 ]]; then + echo "usage: infoscreen-cmd-helper.sh " >&2 + exit 2 +fi + +action="$1" + +case "$action" in + reboot_host) + exec systemctl reboot + ;; + shutdown_host) + exec systemctl poweroff + ;; + *) + echo "unsupported action: $action" >&2 + exit 1 + ;; +esac diff --git a/scripts/infoscreen-display.service b/scripts/infoscreen-display.service index b258706..b14e54b 100644 --- a/scripts/infoscreen-display.service +++ b/scripts/infoscreen-display.service @@ -3,6 +3,8 @@ Description=Infoscreen Display Manager Documentation=https://github.com/RobbStarkAustria/infoscreen_client_2025 After=network.target graphical.target Wants=network-online.target +# Publish an MQTT alert if systemd gives up restarting (StartLimitBurst exceeded). +OnFailure=infoscreen-notify-failure@%n.service [Service] Type=simple diff --git a/scripts/infoscreen-notify-failure.sh b/scripts/infoscreen-notify-failure.sh new file mode 100755 index 0000000..7b01ade --- /dev/null +++ b/scripts/infoscreen-notify-failure.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Publishes a service-failed MQTT notification when called by systemd OnFailure=. +# Usage: infoscreen-notify-failure.sh +# +# Designed to be called from infoscreen-notify-failure@.service. +# Reads broker credentials from .env; reads client UUID from config. +# Safe to run even if MQTT is unreachable (exits cleanly, errors logged to journal). + +set -euo pipefail + +FAILING_UNIT="${1:-unknown}" +PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +ENV_FILE="$PROJECT_DIR/.env" +UUID_FILE="$PROJECT_DIR/src/config/client_uuid.txt" + +# Load .env (skip comments and blank lines) +if [[ -f "$ENV_FILE" ]]; then + set -a + # shellcheck source=/dev/null + source <(grep -v '^\s*#' "$ENV_FILE" | grep -v '^\s*$') + set +a +fi + +MQTT_BROKER="${MQTT_BROKER:-localhost}" +MQTT_PORT="${MQTT_PORT:-1883}" +MQTT_USER="${MQTT_USER:-}" +MQTT_PASSWORD_BROKER="${MQTT_PASSWORD_BROKER:-}" + +CLIENT_UUID="unknown" +if [[ -f "$UUID_FILE" ]]; then + CLIENT_UUID="$(cat "$UUID_FILE" | tr -d '[:space:]')" +fi + +TOPIC="infoscreen/${CLIENT_UUID}/service_failed" +TIMESTAMP="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + +PAYLOAD=$(printf '{"event":"service_failed","unit":"%s","client_uuid":"%s","failed_at":"%s"}' \ + "$FAILING_UNIT" "$CLIENT_UUID" "$TIMESTAMP") + +# Build mosquitto_pub auth args +AUTH_ARGS=() +if [[ -n "$MQTT_USER" ]]; then AUTH_ARGS+=(-u "$MQTT_USER"); fi +if [[ -n "$MQTT_PASSWORD_BROKER" ]]; then AUTH_ARGS+=(-P "$MQTT_PASSWORD_BROKER"); fi + +echo "Publishing service-failed notification: unit=$FAILING_UNIT client=$CLIENT_UUID" + +mosquitto_pub \ + -h "$MQTT_BROKER" \ + -p "$MQTT_PORT" \ + "${AUTH_ARGS[@]}" \ + -t "$TOPIC" \ + -m "$PAYLOAD" \ + -q 1 \ + --retain \ + 2>&1 || echo "WARNING: mosquitto_pub failed (broker unreachable?); notification not delivered" diff --git a/scripts/infoscreen-notify-failure@.service b/scripts/infoscreen-notify-failure@.service new file mode 100644 index 0000000..98e0863 --- /dev/null +++ b/scripts/infoscreen-notify-failure@.service @@ -0,0 +1,19 @@ +[Unit] +Description=Infoscreen service-failed MQTT notifier (%i) +# One-shot: run once and exit. %i is the failing unit name passed by OnFailure=. +After=network.target + +[Service] +Type=oneshot +User=olafn +Group=olafn +WorkingDirectory=/home/olafn/infoscreen-dev +EnvironmentFile=/home/olafn/infoscreen-dev/.env +ExecStart=/home/olafn/infoscreen-dev/scripts/infoscreen-notify-failure.sh %i + +# Do not restart the notifier itself. +Restart=no + +StandardOutput=journal +StandardError=journal +SyslogIdentifier=infoscreen-notify-failure diff --git a/scripts/infoscreen-simclient.service b/scripts/infoscreen-simclient.service new file mode 100644 index 0000000..a4c1cbe --- /dev/null +++ b/scripts/infoscreen-simclient.service @@ -0,0 +1,55 @@ +[Unit] +Description=Infoscreen Simclient (MQTT communication) +Documentation=https://github.com/RobbStarkAustria/infoscreen_client_2025 +# Simclient needs network before starting — MQTT will fail otherwise. +After=network-online.target +Wants=network-online.target +# Publish an MQTT alert if systemd gives up restarting (StartLimitBurst exceeded). +OnFailure=infoscreen-notify-failure@%n.service + +[Service] +# notify: simclient sends READY=1 via sd_notify once fully initialised. +# WatchdogSec: if WATCHDOG=1 is not sent within this window, systemd kills +# and restarts the process — detects hung/deadlocked main loops. +Type=notify +WatchdogSec=60 +User=olafn +Group=olafn +WorkingDirectory=/home/olafn/infoscreen-dev + +# Load all client configuration from the local .env file. +# Keep .env mode 600; systemd reads it as root before dropping privileges. +EnvironmentFile=/home/olafn/infoscreen-dev/.env + +# Start simclient +ExecStart=/home/olafn/infoscreen-dev/scripts/start-simclient.sh + +# Restart on failure (non-zero exit or signal). +# This covers crash recovery AND the reboot-command lifecycle: +# 1. Server sends reboot_host command +# 2. Simclient publishes accepted + execution_started, then exits +# 3. Systemd restarts simclient within RestartSec seconds +# 4. On reconnect, heartbeat loop detects pending_recovery_command and +# publishes completed — closing the lifecycle cleanly. +Restart=on-failure +RestartSec=10 + +# Prevent rapid restart thrash: allow at most 5 restarts in 60 seconds. +StartLimitIntervalSec=60 +StartLimitBurst=5 + +# Logging +StandardOutput=journal +StandardError=journal +SyslogIdentifier=infoscreen-simclient + +# Security settings +NoNewPrivileges=true +PrivateTmp=true + +# Resource limits +LimitNOFILE=65536 + +[Install] +# Simclient runs in multi-user mode — no graphical session required. +WantedBy=multi-user.target diff --git a/scripts/install-command-helper.sh b/scripts/install-command-helper.sh new file mode 100755 index 0000000..856e1d4 --- /dev/null +++ b/scripts/install-command-helper.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Installs the privileged command helper and sudoers drop-in. +# Usage: ./scripts/install-command-helper.sh [linux-user] + +target_user="${1:-$USER}" +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +helper_src="$script_dir/infoscreen-cmd-helper.sh" +helper_dst="/usr/local/bin/infoscreen-cmd-helper.sh" +sudoers_file="/etc/sudoers.d/infoscreen-command-helper" + +if [[ ! -f "$helper_src" ]]; then + echo "helper source not found: $helper_src" >&2 + exit 1 +fi + +sudo install -m 0755 "$helper_src" "$helper_dst" +printf '%s\n' "$target_user ALL=(ALL) NOPASSWD: $helper_dst" | sudo tee "$sudoers_file" >/dev/null +sudo chmod 0440 "$sudoers_file" +sudo visudo -cf "$sudoers_file" >/dev/null + +echo "Installed helper: $helper_dst" +echo "Installed sudoers: $sudoers_file (user: $target_user)" diff --git a/scripts/mock-command-helper.sh b/scripts/mock-command-helper.sh new file mode 100755 index 0000000..46755cf --- /dev/null +++ b/scripts/mock-command-helper.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Non-destructive helper for command lifecycle canary tests. +# Use by starting simclient with: +# COMMAND_HELPER_PATH=/home/olafn/infoscreen-dev/scripts/mock-command-helper.sh + +if [[ $# -ne 1 ]]; then + echo "usage: mock-command-helper.sh " >&2 + exit 2 +fi + +action="$1" + +case "$action" in + reboot_host|shutdown_host) + ;; + *) + echo "unsupported action: $action" >&2 + exit 1 + ;; +esac + +if [[ "${MOCK_COMMAND_HELPER_FORCE_FAIL:-0}" == "1" ]]; then + echo "forced failure for canary test (action=$action)" >&2 + exit 1 +fi + +if [[ "${MOCK_COMMAND_HELPER_SLEEP_SEC:-0}" != "0" ]]; then + sleep "${MOCK_COMMAND_HELPER_SLEEP_SEC}" +fi + +echo "mock helper executed action=$action" +exit 0 diff --git a/scripts/start-simclient.sh b/scripts/start-simclient.sh new file mode 100755 index 0000000..2d4a4e0 --- /dev/null +++ b/scripts/start-simclient.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Start Simclient - MQTT communication and event intake for infoscreen + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" +VENV_PATH="$PROJECT_ROOT/venv" +SIMCLIENT="$PROJECT_ROOT/src/simclient.py" + +echo "📡 Starting Simclient..." +echo "Project root: $PROJECT_ROOT" + +# Check if virtual environment exists +if [ ! -d "$VENV_PATH" ]; then + echo "❌ Virtual environment not found at: $VENV_PATH" + echo "Please create it with: python3 -m venv venv" + exit 1 +fi + +# Activate virtual environment +source "$VENV_PATH/bin/activate" + +# Check if simclient.py exists +if [ ! -f "$SIMCLIENT" ]; then + echo "❌ Simclient not found at: $SIMCLIENT" + exit 1 +fi + +ENV="${ENV:-development}" +echo "Environment: $ENV" + +echo "Starting simclient..." +echo "---" +exec python3 "$SIMCLIENT" diff --git a/scripts/test-mqtt.sh b/scripts/test-mqtt.sh index 8e35d15..ff7a949 100755 --- a/scripts/test-mqtt.sh +++ b/scripts/test-mqtt.sh @@ -1,9 +1,27 @@ #!/bin/bash source "$(dirname "$0")/../.env" +MQTT_AUTH_ARGS=() +MQTT_TLS_ARGS=() + +if [[ -n "${MQTT_USERNAME:-}" ]]; then + MQTT_AUTH_ARGS+=( -u "$MQTT_USERNAME" ) +fi +if [[ -n "${MQTT_PASSWORD:-}" ]]; then + MQTT_AUTH_ARGS+=( -P "$MQTT_PASSWORD" ) +fi +if [[ "${MQTT_TLS_ENABLED:-0}" == "1" || "${MQTT_TLS_ENABLED:-0}" == "true" || "${MQTT_TLS_ENABLED:-0}" == "yes" ]]; then + [[ -n "${MQTT_TLS_CA_CERT:-}" ]] && MQTT_TLS_ARGS+=( --cafile "$MQTT_TLS_CA_CERT" ) + [[ -n "${MQTT_TLS_CERT:-}" ]] && MQTT_TLS_ARGS+=( --cert "$MQTT_TLS_CERT" ) + [[ -n "${MQTT_TLS_KEY:-}" ]] && MQTT_TLS_ARGS+=( --key "$MQTT_TLS_KEY" ) + if [[ "${MQTT_TLS_INSECURE:-0}" == "1" || "${MQTT_TLS_INSECURE:-0}" == "true" || "${MQTT_TLS_INSECURE:-0}" == "yes" ]]; then + MQTT_TLS_ARGS+=( --insecure ) + fi +fi + echo "Testing MQTT connection to $MQTT_BROKER:$MQTT_PORT" echo "Publishing test message..." -mosquitto_pub -h "$MQTT_BROKER" -p "$MQTT_PORT" -t "infoscreen/test" -m "Hello from Pi development setup" +mosquitto_pub -h "$MQTT_BROKER" -p "$MQTT_PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "infoscreen/test" -m "Hello from Pi development setup" echo "Subscribing to test topic (press Ctrl+C to stop)..." -mosquitto_sub -h "$MQTT_BROKER" -p "$MQTT_PORT" -t "infoscreen/test" +mosquitto_sub -h "$MQTT_BROKER" -p "$MQTT_PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "infoscreen/test" diff --git a/scripts/test-power-intent.sh b/scripts/test-power-intent.sh index 3ab2f5d..65881bf 100755 --- a/scripts/test-power-intent.sh +++ b/scripts/test-power-intent.sh @@ -36,6 +36,31 @@ fi BROKER="${MQTT_BROKER:-localhost}" PORT="${MQTT_PORT:-1883}" +MQTT_USERNAME="${MQTT_USERNAME:-}" +MQTT_PASSWORD="${MQTT_PASSWORD:-}" +MQTT_TLS_ENABLED="${MQTT_TLS_ENABLED:-0}" +MQTT_TLS_CA_CERT="${MQTT_TLS_CA_CERT:-}" +MQTT_TLS_CERT="${MQTT_TLS_CERT:-}" +MQTT_TLS_KEY="${MQTT_TLS_KEY:-}" +MQTT_TLS_INSECURE="${MQTT_TLS_INSECURE:-0}" + +MQTT_AUTH_ARGS=() +MQTT_TLS_ARGS=() + +if [[ -n "$MQTT_USERNAME" ]]; then + MQTT_AUTH_ARGS+=( -u "$MQTT_USERNAME" ) +fi +if [[ -n "$MQTT_PASSWORD" ]]; then + MQTT_AUTH_ARGS+=( -P "$MQTT_PASSWORD" ) +fi +if [[ "$MQTT_TLS_ENABLED" == "1" || "$MQTT_TLS_ENABLED" == "true" || "$MQTT_TLS_ENABLED" == "yes" ]]; then + [[ -n "$MQTT_TLS_CA_CERT" ]] && MQTT_TLS_ARGS+=( --cafile "$MQTT_TLS_CA_CERT" ) + [[ -n "$MQTT_TLS_CERT" ]] && MQTT_TLS_ARGS+=( --cert "$MQTT_TLS_CERT" ) + [[ -n "$MQTT_TLS_KEY" ]] && MQTT_TLS_ARGS+=( --key "$MQTT_TLS_KEY" ) + if [[ "$MQTT_TLS_INSECURE" == "1" || "$MQTT_TLS_INSECURE" == "true" || "$MQTT_TLS_INSECURE" == "yes" ]]; then + MQTT_TLS_ARGS+=( --insecure ) + fi +fi # ── Read runtime IDs ───────────────────────────────────────────────────────── GROUP_ID_FILE="$PROJECT_ROOT/src/config/last_group_id.txt" @@ -107,7 +132,7 @@ EOF echo -e "${YELLOW}Publishing to: $topic${NC}" echo "$payload" | python3 -m json.tool 2>/dev/null || echo "$payload" echo "" - mosquitto_pub -h "$BROKER" -p "$PORT" -t "$topic" -q 1 --retain -m "$payload" + mosquitto_pub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "$topic" -q 1 --retain -m "$payload" echo -e "${GREEN}Published (retained, QoS 1)${NC}" echo "intent_id: $intent_id" } @@ -119,7 +144,7 @@ clear_intent() { echo -e "${RED}No group_id found.${NC}" return 1 fi - mosquitto_pub -h "$BROKER" -p "$PORT" -t "$topic" -q 1 --retain --null-message + mosquitto_pub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "$topic" -q 1 --retain --null-message echo -e "${GREEN}Retained intent cleared from broker${NC}" } @@ -167,7 +192,7 @@ subscribe_power_state() { echo -e "${YELLOW}Subscribing to: $topic${NC}" echo "(Ctrl-C to stop)" echo "" - mosquitto_sub -h "$BROKER" -p "$PORT" -t "$topic" | \ + mosquitto_sub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "$topic" | \ python3 -c " import sys, json for line in sys.stdin: @@ -216,7 +241,7 @@ while true; do echo -e "${RED}No group_id.${NC}" else TOPIC="$(group_topic)" - mosquitto_pub -h "$BROKER" -p "$PORT" -t "$TOPIC" -q 1 --retain \ + mosquitto_pub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "$TOPIC" -q 1 --retain \ -m '{"schema_version":"1.0","desired_state":"on"}' echo -e "${YELLOW}⚠ Malformed intent published - client must reject with 'missing required field' in log${NC}" fi diff --git a/scripts/test-reboot-command.sh b/scripts/test-reboot-command.sh new file mode 100755 index 0000000..cf535a2 --- /dev/null +++ b/scripts/test-reboot-command.sh @@ -0,0 +1,244 @@ +#!/usr/bin/env bash +# Safe end-to-end command lifecycle canary for reboot/shutdown contract v1. +# Verifies ack flow: accepted -> execution_started -> completed/failed. + +set -euo pipefail + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" + +BLUE='\033[0;34m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' + +ENV_FILE="$PROJECT_ROOT/.env" +if [[ -f "$ENV_FILE" ]]; then + while IFS='=' read -r key value; do + key="${key//[$'\t\r\n']}" + key="$(echo "$key" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + [[ -z "$key" ]] && continue + [[ "$key" =~ ^# ]] && continue + [[ "$key" =~ ^[A-Z_][A-Z0-9_]*$ ]] || continue + value="${value%%#*}" + value="${value//[$'\t\r\n']}" + value="$(echo "$value" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + export "$key=$value" + done < "$ENV_FILE" +fi + +BROKER="${MQTT_BROKER:-localhost}" +PORT="${MQTT_PORT:-1883}" +MQTT_USERNAME="${MQTT_USERNAME:-}" +MQTT_PASSWORD="${MQTT_PASSWORD:-}" +MQTT_TLS_ENABLED="${MQTT_TLS_ENABLED:-0}" +MQTT_TLS_CA_CERT="${MQTT_TLS_CA_CERT:-}" +MQTT_TLS_CERT="${MQTT_TLS_CERT:-}" +MQTT_TLS_KEY="${MQTT_TLS_KEY:-}" +MQTT_TLS_INSECURE="${MQTT_TLS_INSECURE:-0}" +CLIENT_UUID_FILE="$PROJECT_ROOT/src/config/client_uuid.txt" +LAST_COMMAND_STATE_FILE="$PROJECT_ROOT/src/config/last_command_state.json" + +if [[ ! -f "$CLIENT_UUID_FILE" ]]; then + echo -e "${RED}client UUID file missing: $CLIENT_UUID_FILE${NC}" + exit 1 +fi + +if ! command -v mosquitto_pub >/dev/null 2>&1 || ! command -v mosquitto_sub >/dev/null 2>&1; then + echo -e "${RED}mosquitto_pub/sub not found. Install mosquitto-clients.${NC}" + exit 1 +fi + +CLIENT_UUID="$(tr -d '[:space:]' < "$CLIENT_UUID_FILE")" +COMMAND_TOPIC="infoscreen/${CLIENT_UUID}/commands" +COMMAND_TOPIC_ALIAS="infoscreen/${CLIENT_UUID}/command" +ACK_TOPIC="infoscreen/${CLIENT_UUID}/commands/ack" +ACK_TOPIC_ALIAS="infoscreen/${CLIENT_UUID}/command/ack" + +MQTT_AUTH_ARGS=() +MQTT_TLS_ARGS=() + +if [[ -n "$MQTT_USERNAME" ]]; then + MQTT_AUTH_ARGS+=( -u "$MQTT_USERNAME" ) +fi +if [[ -n "$MQTT_PASSWORD" ]]; then + MQTT_AUTH_ARGS+=( -P "$MQTT_PASSWORD" ) +fi +if [[ "$MQTT_TLS_ENABLED" == "1" || "$MQTT_TLS_ENABLED" == "true" || "$MQTT_TLS_ENABLED" == "yes" ]]; then + [[ -n "$MQTT_TLS_CA_CERT" ]] && MQTT_TLS_ARGS+=( --cafile "$MQTT_TLS_CA_CERT" ) + [[ -n "$MQTT_TLS_CERT" ]] && MQTT_TLS_ARGS+=( --cert "$MQTT_TLS_CERT" ) + [[ -n "$MQTT_TLS_KEY" ]] && MQTT_TLS_ARGS+=( --key "$MQTT_TLS_KEY" ) + if [[ "$MQTT_TLS_INSECURE" == "1" || "$MQTT_TLS_INSECURE" == "true" || "$MQTT_TLS_INSECURE" == "yes" ]]; then + MQTT_TLS_ARGS+=( --insecure ) + fi +fi + +ACTION="${1:-reboot_host}" +MODE="${2:-success}" # success | failed +TOPIC_MODE="${3:-canonical}" # canonical | alias +WAIT_SEC="${4:-25}" + +if [[ "$ACTION" != "reboot_host" && "$ACTION" != "shutdown_host" ]]; then + echo -e "${RED}invalid action '$ACTION' (expected reboot_host|shutdown_host)${NC}" + exit 1 +fi +if [[ "$MODE" != "success" && "$MODE" != "failed" ]]; then + echo -e "${RED}invalid mode '$MODE' (expected success|failed)${NC}" + exit 1 +fi +if [[ "$TOPIC_MODE" != "canonical" && "$TOPIC_MODE" != "alias" ]]; then + echo -e "${RED}invalid topic mode '$TOPIC_MODE' (expected canonical|alias)${NC}" + exit 1 +fi +if ! [[ "$WAIT_SEC" =~ ^[0-9]+$ ]] || [[ "$WAIT_SEC" -lt 1 ]]; then + echo -e "${RED}invalid wait seconds '$WAIT_SEC' (expected positive integer)${NC}" + exit 1 +fi + +if [[ "$TOPIC_MODE" == "alias" ]]; then + COMMAND_TOPIC="$COMMAND_TOPIC_ALIAS" +fi + +COMMAND_ID="$(python3 - <<'PY' +import uuid +print(uuid.uuid4()) +PY +)" +ISSUED_AT="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" +EXPIRES_EPOCH="$(( $(date +%s) + 240 ))" +EXPIRES_AT="$(date -u -d "@$EXPIRES_EPOCH" +"%Y-%m-%dT%H:%M:%SZ")" + +PAYLOAD="$(cat </dev/null 2>&1 || true + [[ -n "${SUB_PID_2:-}" ]] && kill "$SUB_PID_2" >/dev/null 2>&1 || true + rm -f "$TMP_ACK_LOG" +} +trap cleanup EXIT + +echo -e "${BLUE}================================================${NC}" +echo -e "${BLUE}Command Lifecycle Canary${NC}" +echo -e "${BLUE}================================================${NC}" +echo " Broker : $BROKER:$PORT" +echo " Client UUID : $CLIENT_UUID" +echo " Command ID : $COMMAND_ID" +echo " Action : $ACTION" +echo " Mode : $MODE" +echo " Cmd Topic : $COMMAND_TOPIC" +echo " Ack Topics : $ACK_TOPIC , $ACK_TOPIC_ALIAS" +echo "" +echo -e "${YELLOW}IMPORTANT${NC}: to avoid real reboot/shutdown, run simclient with" +echo " COMMAND_HELPER_PATH=$PROJECT_ROOT/scripts/mock-command-helper.sh" +echo "" + +# Subscribe first to avoid missing retained/non-retained race windows. +mosquitto_sub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -q 1 -v -t "$ACK_TOPIC" >> "$TMP_ACK_LOG" & +SUB_PID_1=$! +mosquitto_sub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -q 1 -v -t "$ACK_TOPIC_ALIAS" >> "$TMP_ACK_LOG" & +SUB_PID_2=$! +sleep 0.5 + +if [[ "$MODE" == "failed" ]]; then + echo -e "${YELLOW}If simclient was started with MOCK_COMMAND_HELPER_FORCE_FAIL=1, expected terminal status is failed.${NC}" +fi + +echo -e "${YELLOW}Publishing command payload...${NC}" +mosquitto_pub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -q 1 -t "$COMMAND_TOPIC" -m "$PAYLOAD" + +EXPECTED_TERMINAL="completed" +if [[ "$MODE" == "failed" ]]; then + EXPECTED_TERMINAL="failed" +fi + +EXPECT_RECOVERY_COMPLETION=0 +if [[ "$ACTION" == "reboot_host" && "$MODE" == "success" ]]; then + EXPECTED_TERMINAL="" + EXPECT_RECOVERY_COMPLETION=1 +fi + +DEADLINE=$(( $(date +%s) + WAIT_SEC )) +SEEN_ACCEPTED=0 +SEEN_STARTED=0 +SEEN_TERMINAL=0 + +while [[ $(date +%s) -lt $DEADLINE ]]; do + if grep -q '"command_id"' "$TMP_ACK_LOG" 2>/dev/null; then + if grep -q "\"command_id\": \"$COMMAND_ID\"" "$TMP_ACK_LOG"; then + grep -q '"status": "accepted"' "$TMP_ACK_LOG" && SEEN_ACCEPTED=1 || true + grep -q '"status": "execution_started"' "$TMP_ACK_LOG" && SEEN_STARTED=1 || true + if [[ -n "$EXPECTED_TERMINAL" ]]; then + grep -q "\"status\": \"$EXPECTED_TERMINAL\"" "$TMP_ACK_LOG" && SEEN_TERMINAL=1 || true + fi + fi + fi + + if [[ $SEEN_ACCEPTED -eq 1 && $SEEN_STARTED -eq 1 ]]; then + if [[ -z "$EXPECTED_TERMINAL" || $SEEN_TERMINAL -eq 1 ]]; then + break + fi + fi + sleep 1 +done + +echo "" +echo -e "${BLUE}Ack stream (filtered by command_id):${NC}" +python3 - <<'PY' "$TMP_ACK_LOG" "$COMMAND_ID" +import json +import sys + +path, command_id = sys.argv[1], sys.argv[2] +with open(path, "r", encoding="utf-8", errors="ignore") as f: + for line in f: + line = line.strip() + if not line: + continue + parts = line.split(" ", 1) + payload = parts[1] if len(parts) == 2 else parts[0] + try: + obj = json.loads(payload) + except Exception: + continue + if obj.get("command_id") == command_id: + print(json.dumps(obj, indent=2)) +PY + +if [[ $SEEN_ACCEPTED -eq 1 && $SEEN_STARTED -eq 1 ]]; then + if [[ -z "$EXPECTED_TERMINAL" ]]; then + echo -e "${GREEN}PASS${NC}: observed accepted -> execution_started" + echo -e "${YELLOW}NOTE${NC}: completed for reboot_host is expected only after client reconnect/recovery." + elif [[ $SEEN_TERMINAL -eq 1 ]]; then + echo -e "${GREEN}PASS${NC}: observed accepted -> execution_started -> $EXPECTED_TERMINAL" + else + echo -e "${RED}FAIL${NC}: missing expected terminal state $EXPECTED_TERMINAL for command_id=$COMMAND_ID" + exit 1 + fi +else + echo -e "${RED}FAIL${NC}: missing expected lifecycle states for command_id=$COMMAND_ID" + if [[ -n "$EXPECTED_TERMINAL" ]]; then + echo " observed: accepted=$SEEN_ACCEPTED execution_started=$SEEN_STARTED terminal($EXPECTED_TERMINAL)=$SEEN_TERMINAL" + else + echo " observed: accepted=$SEEN_ACCEPTED execution_started=$SEEN_STARTED" + fi + exit 1 +fi + +if [[ -f "$LAST_COMMAND_STATE_FILE" ]]; then + echo "" + echo -e "${BLUE}Last command state:${NC}" + python3 -m json.tool "$LAST_COMMAND_STATE_FILE" || cat "$LAST_COMMAND_STATE_FILE" +fi diff --git a/src/.env.production.template b/src/.env.production.template index 45a4a47..1f80d09 100644 --- a/src/.env.production.template +++ b/src/.env.production.template @@ -8,6 +8,13 @@ VERSION=latest # MQTT Broker MQTT_BROKER=192.168.1.100 MQTT_PORT=1883 +MQTT_USERNAME=infoscreen-client- +MQTT_PASSWORD= +MQTT_TLS_ENABLED=0 +# MQTT_TLS_CA_CERT=/etc/infoscreen/mqtt/ca.crt +# MQTT_TLS_CERT=/etc/infoscreen/mqtt/client.crt +# MQTT_TLS_KEY=/etc/infoscreen/mqtt/client.key +# MQTT_TLS_INSECURE=0 # Timing (production values) HEARTBEAT_INTERVAL=60 diff --git a/src/.env.template b/src/.env.template index 36c3c46..cf5d5e7 100644 --- a/src/.env.template +++ b/src/.env.template @@ -9,6 +9,13 @@ LOG_LEVEL=DEBUG # MQTT Broker Configuration MQTT_BROKER=192.168.1.100 # Change to your MQTT server IP MQTT_PORT=1883 +MQTT_USERNAME=infoscreen-client- +MQTT_PASSWORD= +MQTT_TLS_ENABLED=0 +# MQTT_TLS_CA_CERT=/etc/infoscreen/mqtt/ca.crt +# MQTT_TLS_CERT=/etc/infoscreen/mqtt/client.crt +# MQTT_TLS_KEY=/etc/infoscreen/mqtt/client.key +# MQTT_TLS_INSECURE=0 # Timing Configuration (shorter intervals for development) HEARTBEAT_INTERVAL=10 # Heartbeat frequency in seconds diff --git a/src/README.md b/src/README.md index 3cd94ba..d7b3655 100644 --- a/src/README.md +++ b/src/README.md @@ -22,23 +22,52 @@ Primary runtime flow: ## Key Files - `display_manager.py`: display lifecycle, HDMI-CEC, screenshots, local fallback logic. -- `simclient.py`: MQTT callbacks, event persistence, dashboard publishing, power-intent validation. +- `simclient.py`: MQTT callbacks, event persistence, dashboard publishing, power-intent validation, command intake. - `current_event.json`: active event state consumed by the display manager. - `current_process_health.json`: local health bridge for monitoring. - `power_intent_state.json`: latest validated power intent from MQTT. - `power_state.json`: latest applied power action telemetry. - `screenshots/meta.json`: screenshot metadata used by the dashboard path. +- `../scripts/start-simclient.sh`: launcher for `simclient.py` (used by the systemd unit). +- `../scripts/start-display-manager.sh`: launcher for `display_manager.py`. +- `../scripts/infoscreen-simclient.service`: systemd unit for `simclient.py`. +- `../scripts/infoscreen-display.service`: systemd unit for `display_manager.py`. +- `../scripts/infoscreen-notify-failure@.service`: systemd template unit; fires on `OnFailure=`. +- `../scripts/infoscreen-notify-failure.sh`: publishes `service_failed` MQTT alert when a unit gives up. ## Developer Workflow -Typical local workflow: +On deployed devices, both processes are managed by systemd: + +```bash +# Start / stop / restart +sudo systemctl start infoscreen-simclient infoscreen-display +sudo systemctl restart infoscreen-simclient infoscreen-display + +# Follow logs +journalctl -u infoscreen-simclient -u infoscreen-display -f +``` + +First-time systemd setup: + +```bash +sudo cp scripts/infoscreen-simclient.service /etc/systemd/system/ +sudo cp scripts/infoscreen-display.service /etc/systemd/system/ +sudo cp scripts/infoscreen-notify-failure@.service /etc/systemd/system/ +sudo systemctl daemon-reload +sudo systemctl enable infoscreen-simclient infoscreen-display +``` + +Or run `src/pi-setup.sh` which includes the above as step 14. + +For local development without systemd: ```bash cd ~/infoscreen-dev source venv/bin/activate # Terminal 1 -./scripts/start-dev.sh +./scripts/start-simclient.sh # Terminal 2 ./scripts/start-display-manager.sh @@ -51,6 +80,7 @@ Useful helpers: - `./scripts/test-mqtt.sh` - `./scripts/test-screenshot.sh` - `./scripts/test-power-intent.sh` +- `./scripts/test-progress-bars.sh` ## MQTT Topics @@ -59,8 +89,11 @@ Useful helpers: - `infoscreen/discovery` - `infoscreen/{client_id}/heartbeat` - `infoscreen/{client_id}/dashboard` -- `infoscreen/{client_id}/health` +- `infoscreen/{client_id}/health` — includes `broker_connection` block with `reconnect_count`, `last_disconnect_at` - `infoscreen/{client_id}/power/state` +- `infoscreen/{client_id}/commands/ack` — command acknowledgement (states: `accepted`, `rejected`, `execution_started`, `completed`, `failed`) +- `infoscreen/{client_id}/command/ack` — legacy ack topic (also published for compatibility) +- `infoscreen/{client_id}/service_failed` — retained alert published by `infoscreen-notify-failure.sh` when systemd gives up restarting a unit ### Server → Client @@ -68,6 +101,7 @@ Useful helpers: - `infoscreen/{client_id}/group_id` - `infoscreen/events/{group_id}` - `infoscreen/groups/{group_id}/power/intent` +- `infoscreen/{client_id}/commands` — remote command intake (`reboot`, `shutdown`) ## Event and Display Notes @@ -118,6 +152,8 @@ cat ~/infoscreen-dev/src/screenshots/meta.json - `ENV=development` disables HDMI-CEC in the display manager. - `POWER_CONTROL_MODE` controls local vs hybrid vs mqtt power behavior. +- `COMMAND_HELPER_PATH` points to the shell script that executes privileged commands (reboot/shutdown). Use `mock-command-helper.sh` for local testing. +- `COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE=1` makes a mock reboot complete immediately instead of waiting for process restart. Only works when the helper basename is `mock-command-helper.sh`. - File download host rewriting is handled in `simclient.py` using `FILE_SERVER_*` settings. ## Related Documents diff --git a/src/pi-setup.sh b/src/pi-setup.sh index 9d3d877..cefd90f 100644 --- a/src/pi-setup.sh +++ b/src/pi-setup.sh @@ -125,13 +125,54 @@ EOF chmod +x "$PROJECT_DIR/scripts/start-dev.sh" log_ok "start-dev.sh created" -# 13. SSH enable (for remote dev) +# 13. Install command helper + sudoers rule for reboot/shutdown command execution +log_step "Installing command helper and sudoers policy..." +HELPER_SRC="$PROJECT_DIR/scripts/infoscreen-cmd-helper.sh" +HELPER_DST="/usr/local/bin/infoscreen-cmd-helper.sh" +SUDOERS_FILE="/etc/sudoers.d/infoscreen-command-helper" + +if [ -f "$HELPER_SRC" ]; then + sudo install -m 0755 "$HELPER_SRC" "$HELPER_DST" + echo "$USER ALL=(ALL) NOPASSWD: $HELPER_DST" | sudo tee "$SUDOERS_FILE" >/dev/null + sudo chmod 0440 "$SUDOERS_FILE" + sudo visudo -cf "$SUDOERS_FILE" >/dev/null + log_ok "Command helper installed at $HELPER_DST and sudoers rule validated" +else + log_warn "Command helper source not found: $HELPER_SRC" +fi + +# 14. Systemd service units (simclient + display manager) +log_step "Installing systemd service units..." +SERVICES_SRC="$PROJECT_DIR/scripts" +SYSTEMD_DIR="/etc/systemd/system" + +for unit in infoscreen-simclient.service infoscreen-display.service "infoscreen-notify-failure@.service"; do + if [ -f "$SERVICES_SRC/$unit" ]; then + sudo cp "$SERVICES_SRC/$unit" "$SYSTEMD_DIR/$unit" + log_ok "Installed $SYSTEMD_DIR/$unit" + else + log_warn "Service unit not found: $SERVICES_SRC/$unit" + fi +done + +sudo systemctl daemon-reload + +for unit in infoscreen-simclient.service infoscreen-display.service; do + if [ -f "$SYSTEMD_DIR/$unit" ]; then + sudo systemctl enable "$unit" >/dev/null 2>&1 || true + log_ok "Enabled: $unit" + fi +done + +log_ok "Systemd units installed. Start with: sudo systemctl start infoscreen-simclient infoscreen-display" + +# 15. SSH enable (for remote dev) log_step "Ensuring SSH service enabled..." sudo systemctl enable ssh >/dev/null 2>&1 || true sudo systemctl start ssh >/dev/null 2>&1 || true log_ok "SSH service active" -# 14. Summary +# 16. Summary echo "" echo -e "${GREEN}🎉 Setup complete!${NC}" echo "Project: $PROJECT_DIR" diff --git a/src/simclient.py b/src/simclient.py index 04119f8..7cb6847 100644 --- a/src/simclient.py +++ b/src/simclient.py @@ -7,18 +7,39 @@ import json import socket import hashlib import paho.mqtt.client as mqtt +import ssl import os import shutil import re import platform import logging +import subprocess from dotenv import load_dotenv import requests import base64 -from datetime import datetime, timezone +from datetime import datetime, timezone, timedelta import threading from urllib.parse import urlsplit, urlunsplit, unquote + +def _sd_notify(msg: str) -> None: + """Send a sd_notify message to systemd via NOTIFY_SOCKET. + + Uses raw socket so no extra package (systemd-python) is required. + Safe to call when not running under systemd (NOTIFY_SOCKET unset). + """ + sock_path = os.environ.get("NOTIFY_SOCKET") + if not sock_path: + return + try: + addr = sock_path.lstrip("@") # abstract namespace sockets start with '@' + with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as s: + s.connect(addr) + s.sendall(msg.encode()) + except Exception: + pass # never crash the app over a watchdog notification + + # ENV laden - support both container and native development env_paths = [ "/workspace/simclient/.env", # Container path @@ -96,6 +117,18 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG" if ENV == "development" else "INFO") MQTT_BROKER = _env_host("MQTT_BROKER", "localhost" if ENV == "development" else "mqtt") MQTT_PORT = _env_int("MQTT_PORT", 1883) DEBUG_MODE = _env_bool("DEBUG_MODE", ENV == "development") +MQTT_USER = _env_str_clean("MQTT_USER", "") +MQTT_PASSWORD_BROKER = _env_str_clean("MQTT_PASSWORD_BROKER", "") +MQTT_USERNAME = _env_str_clean("MQTT_USERNAME", "") +MQTT_PASSWORD = _env_str_clean("MQTT_PASSWORD", "") +MQTT_TLS_CA_CERT = _env_str_clean("MQTT_TLS_CA_CERT", "") +MQTT_TLS_CERT = _env_str_clean("MQTT_TLS_CERT", "") +MQTT_TLS_KEY = _env_str_clean("MQTT_TLS_KEY", "") +MQTT_TLS_INSECURE = _env_bool("MQTT_TLS_INSECURE", False) +MQTT_TLS_ENABLED = _env_bool( + "MQTT_TLS_ENABLED", + bool(MQTT_TLS_CA_CERT or MQTT_TLS_CERT or MQTT_TLS_KEY), +) MQTT_BROKER_FALLBACKS = [] _fallbacks_raw = os.getenv("MQTT_BROKER_FALLBACKS", "") if _fallbacks_raw: @@ -150,6 +183,48 @@ SCREENSHOT_META_FILE = os.path.join(SCREENSHOT_DIR, "meta.json") POWER_CONTROL_MODE = os.getenv("POWER_CONTROL_MODE", "local").strip().lower() POWER_INTENT_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_intent_state.json") POWER_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_state.json") +COMMAND_STATE_DIR = os.path.join(os.path.dirname(__file__), "config") +PROCESSED_COMMANDS_FILE = os.path.join(COMMAND_STATE_DIR, "processed_commands.json") +LAST_COMMAND_STATE_FILE = os.path.join(COMMAND_STATE_DIR, "last_command_state.json") +COMMAND_HELPER_PATH = os.getenv("COMMAND_HELPER_PATH", "/usr/local/bin/infoscreen-cmd-helper.sh") +COMMAND_EXEC_TIMEOUT_SEC = _env_int("COMMAND_EXEC_TIMEOUT_SEC", 15) +COMMAND_DEDUPE_TTL_HOURS = _env_int("COMMAND_DEDUPE_TTL_HOURS", 24) +COMMAND_DEDUPE_MAX_ENTRIES = _env_int("COMMAND_DEDUPE_MAX_ENTRIES", 5000) +COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE = _env_bool("COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE", False) + +NIL_COMMAND_ID = "00000000-0000-0000-0000-000000000000" +COMMAND_ACTIONS = ("reboot_host", "shutdown_host") +ACK_STATUSES = ("accepted", "execution_started", "completed", "failed") +COMMAND_ERROR_CODES = { + "invalid_schema", + "missing_field", + "stale_command", + "duplicate_command", + "permission_denied_local", + "execution_timeout", + "execution_failed", + "broker_unavailable", + "internal_error", +} + + +def command_requires_recovery_completion(action): + return action == "reboot_host" + + +def command_mock_reboot_immediate_complete_enabled(action): + if action != "reboot_host" or not COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE: + return False + + helper_basename = os.path.basename((COMMAND_HELPER_PATH or "").strip()) + if helper_basename == "mock-command-helper.sh": + return True + + logging.warning( + "Ignoring COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE because helper is not mock: %s", + COMMAND_HELPER_PATH, + ) + return False discovered = False @@ -361,6 +436,246 @@ def write_power_intent_state(data): logging.error(f"Error writing power intent state: {e}") +def _atomic_write_json(path, data): + os.makedirs(os.path.dirname(path), exist_ok=True) + tmp_path = path + ".tmp" + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=2) + os.replace(tmp_path, path) + + +def _read_json_or_default(path, default): + try: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception: + return default + + +def _extract_command_id(payload): + if isinstance(payload, dict): + value = payload.get("command_id") + if isinstance(value, str) and value.strip(): + return value.strip() + return NIL_COMMAND_ID + + +def _as_uuid_str(value): + if not isinstance(value, str) or not value.strip(): + raise ValueError("must be a non-empty string") + return str(uuid.UUID(value.strip())) + + +def _prune_processed_commands(commands): + if not isinstance(commands, dict): + return {} + + cutoff = datetime.now(timezone.utc) - timedelta(hours=max(1, COMMAND_DEDUPE_TTL_HOURS)) + kept = {} + sortable = [] + + for command_id, entry in commands.items(): + if not isinstance(entry, dict): + continue + processed_at = entry.get("processed_at") + if not processed_at: + continue + try: + processed_dt = _parse_utc_iso(processed_at) + except Exception: + continue + if processed_dt < cutoff: + continue + kept[command_id] = entry + sortable.append((processed_dt, command_id)) + + max_entries = max(1, COMMAND_DEDUPE_MAX_ENTRIES) + if len(sortable) > max_entries: + sortable.sort(reverse=True) + keep_ids = {cid for _, cid in sortable[:max_entries]} + kept = {cid: entry for cid, entry in kept.items() if cid in keep_ids} + + return kept + + +def load_processed_commands(): + state = _read_json_or_default(PROCESSED_COMMANDS_FILE, {"commands": {}}) + commands = state.get("commands") if isinstance(state, dict) else {} + commands = _prune_processed_commands(commands) + _atomic_write_json(PROCESSED_COMMANDS_FILE, {"commands": commands}) + return commands + + +def persist_processed_commands(commands): + sanitized = _prune_processed_commands(commands) + _atomic_write_json(PROCESSED_COMMANDS_FILE, {"commands": sanitized}) + + +def load_last_command_state(): + state = _read_json_or_default(LAST_COMMAND_STATE_FILE, {}) + if isinstance(state, dict): + return state + return {} + + +def write_last_command_state(data): + _atomic_write_json(LAST_COMMAND_STATE_FILE, data) + + +def validate_command_payload(payload, expected_client_uuid): + if not isinstance(payload, dict): + return False, None, "invalid_schema", "payload must be a JSON object" + + required = { + "schema_version", + "command_id", + "client_uuid", + "action", + "issued_at", + "expires_at", + "requested_by", + "reason", + } + payload_keys = set(payload.keys()) + missing = sorted(required - payload_keys) + if missing: + return False, None, "missing_field", f"missing required fields: {', '.join(missing)}" + extras = sorted(payload_keys - required) + if extras: + return False, None, "invalid_schema", f"unexpected fields: {', '.join(extras)}" + + if payload.get("schema_version") != "1.0": + return False, None, "invalid_schema", "schema_version must be 1.0" + + try: + command_id = _as_uuid_str(payload.get("command_id")) + except Exception: + return False, None, "invalid_schema", "command_id must be a valid UUID" + + try: + client_uuid = _as_uuid_str(payload.get("client_uuid")) + except Exception: + return False, None, "invalid_schema", "client_uuid must be a valid UUID" + + try: + expected_uuid = _as_uuid_str(expected_client_uuid) + except Exception: + expected_uuid = str(expected_client_uuid).strip() + + if client_uuid != expected_uuid: + return False, None, "invalid_schema", "client_uuid does not match this client" + + action = payload.get("action") + if action not in COMMAND_ACTIONS: + return False, None, "invalid_schema", f"action must be one of {COMMAND_ACTIONS}" + + try: + issued_at = _parse_utc_iso(payload.get("issued_at")) + expires_at = _parse_utc_iso(payload.get("expires_at")) + except Exception as e: + return False, None, "invalid_schema", f"invalid timestamp: {e}" + + if expires_at <= issued_at: + return False, None, "invalid_schema", "expires_at must be later than issued_at" + if datetime.now(timezone.utc) > expires_at: + return False, None, "stale_command", "command expired" + + requested_by = payload.get("requested_by") + if requested_by is not None: + if not isinstance(requested_by, int) or requested_by < 1: + return False, None, "invalid_schema", "requested_by must be integer >= 1 or null" + + reason = payload.get("reason") + if reason is not None: + if not isinstance(reason, str): + return False, None, "invalid_schema", "reason must be string or null" + if len(reason) > 2000: + return False, None, "invalid_schema", "reason exceeds max length 2000" + + normalized = { + "schema_version": "1.0", + "command_id": command_id, + "client_uuid": client_uuid, + "action": action, + "issued_at": issued_at.strftime("%Y-%m-%dT%H:%M:%SZ"), + "expires_at": expires_at.strftime("%Y-%m-%dT%H:%M:%SZ"), + "requested_by": requested_by, + "reason": reason, + } + return True, normalized, None, None + + +def publish_command_ack( + client, + client_uuid, + command_id, + status, + error_code=None, + error_message=None, + expires_at=None, +): + if status not in ACK_STATUSES: + raise ValueError(f"invalid ack status: {status}") + + if status == "failed": + if not isinstance(error_code, str) or not error_code.strip(): + error_code = "internal_error" + if not isinstance(error_message, str) or not error_message.strip(): + error_message = "failed without diagnostic message" + else: + error_code = None + error_message = None + + if isinstance(error_code, str): + error_code = error_code[:128] + if isinstance(error_message, str): + error_message = error_message[:4000] + + ack_payload = { + "command_id": command_id, + "status": status, + "error_code": error_code, + "error_message": error_message, + } + encoded = json.dumps(ack_payload) + + ack_topics = [ + f"infoscreen/{client_uuid}/commands/ack", + f"infoscreen/{client_uuid}/command/ack", + ] + retry_schedule = [0.5, 1, 2, 4, 5] + attempt = 0 + + while True: + all_ok = True + for topic in ack_topics: + result = client.publish(topic, encoded, qos=1, retain=False) + if result.rc != mqtt.MQTT_ERR_SUCCESS: + all_ok = False + logging.warning( + "Command ack publish failed: topic=%s status=%s rc=%s", + topic, + status, + result.rc, + ) + + if all_ok: + logging.info("Command ack published: command_id=%s status=%s", command_id, status) + return True + + if expires_at: + try: + if datetime.now(timezone.utc) >= _parse_utc_iso(expires_at): + logging.warning("Command ack retry stopped at expiry: command_id=%s", command_id) + return False + except Exception: + pass + + delay = retry_schedule[min(attempt, len(retry_schedule) - 1)] + attempt += 1 + time.sleep(delay) + + def on_message(client, userdata, msg, properties=None): global discovered logging.info(f"Received: {msg.topic} {msg.payload.decode()}") @@ -482,6 +797,71 @@ def get_model(): SOFTWARE_VERSION = "1.0.0" # Optional: Anpassen bei neuen Releases +def _detect_watchdog_enabled(): + env_flag = os.getenv("WATCHDOG_ENABLED", "").strip().lower() + if env_flag in ("1", "true", "yes", "on"): + return True + if os.path.exists("/dev/watchdog"): + return True + return False + + +def _detect_boot_source(): + try: + with open("/proc/cmdline", "r", encoding="utf-8") as f: + cmdline = f.read().strip() + for token in cmdline.split(): + if token.startswith("root="): + return token.split("=", 1)[1] + except Exception: + pass + return "unknown" + + +def configure_mqtt_security(client): + # Prefer broker-scoped auth vars when present, fallback to legacy vars. + auth_username = MQTT_USER or MQTT_USERNAME + auth_password = MQTT_PASSWORD_BROKER if MQTT_USER else MQTT_PASSWORD + + configured = { + "username": bool(auth_username), + "tls": False, + "tls_insecure": False, + } + + if auth_username: + client.username_pw_set(auth_username, auth_password or None) + configured["username"] = True + logging.info("Configured MQTT username/password authentication") + + if not MQTT_TLS_ENABLED: + return configured + + tls_kwargs = { + "ca_certs": MQTT_TLS_CA_CERT or None, + "certfile": MQTT_TLS_CERT or None, + "keyfile": MQTT_TLS_KEY or None, + "tls_version": ssl.PROTOCOL_TLS_CLIENT, + } + client.tls_set(**tls_kwargs) + configured["tls"] = True + + if MQTT_TLS_INSECURE: + client.tls_insecure_set(True) + configured["tls_insecure"] = True + logging.warning("MQTT TLS hostname verification disabled via MQTT_TLS_INSECURE") + else: + client.tls_insecure_set(False) + + logging.info( + "Configured MQTT TLS: ca=%s client_cert=%s client_key=%s", + bool(MQTT_TLS_CA_CERT), + bool(MQTT_TLS_CERT), + bool(MQTT_TLS_KEY), + ) + return configured + + def send_discovery(client, client_id, hardware_token, ip_addr): macs = get_mac_addresses() discovery_msg = { @@ -494,6 +874,12 @@ def send_discovery(client, client_id, hardware_token, ip_addr): "software_version": SOFTWARE_VERSION, "macs": macs, "model": get_model(), + "capabilities": { + "recovery_class": "software_only", + "watchdog_enabled": _detect_watchdog_enabled(), + "boot_source": _detect_boot_source(), + "command_schema_version": "1.0", + }, } client.publish("infoscreen/discovery", json.dumps(discovery_msg)) logging.info(f"Discovery message sent: {discovery_msg}") @@ -766,7 +1152,7 @@ def delete_client_settings(): logging.error(f"Error deleting client settings: {e}") -def publish_health_message(client, client_id): +def publish_health_message(client, client_id, connection_state=None): """Publish health status to server via MQTT""" try: health = read_health_state() @@ -784,6 +1170,17 @@ def publish_health_message(client, client_id): "status": health.get("process_status") } } + + if connection_state is not None: + last_disc = connection_state.get("last_disconnect") + payload["broker_connection"] = { + "broker_reachable": bool(connection_state.get("connected")), + "reconnect_count": connection_state.get("reconnect_count", 0), + "last_disconnect_at": ( + datetime.fromtimestamp(last_disc, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + if last_disc else None + ), + } topic = f"infoscreen/{client_id}/health" res = client.publish(topic, json.dumps(payload), qos=1) @@ -1007,6 +1404,10 @@ def main(): power_intent_topic = None last_power_intent_id = None last_power_issued_at = None + command_topic = f"infoscreen/{client_id}/commands" + command_topic_alias = f"infoscreen/{client_id}/command" + processed_commands = load_processed_commands() + pending_recovery_command = load_last_command_state() # paho-mqtt v2: opt into latest callback API to avoid deprecation warnings. client_kwargs = {"protocol": mqtt.MQTTv311} @@ -1018,12 +1419,18 @@ def main(): pass client = mqtt.Client(**client_kwargs) client.on_message = on_message + configure_mqtt_security(client) # Enable automatic reconnection client.reconnect_delay_set(min_delay=1, max_delay=120) # Connection state tracking - connection_state = {"connected": False, "last_disconnect": None} + connection_state = { + "connected": False, + "last_disconnect": None, + "reconnect_count": 0, + "connect_count": 0, + } # Optional: Enable MQTT debug logging in DEBUG_MODE if DEBUG_MODE: @@ -1090,6 +1497,9 @@ def main(): if rc == 0: connection_state["connected"] = True connection_state["last_disconnect"] = None + connection_state["connect_count"] = connection_state.get("connect_count", 0) + 1 + if connection_state["connect_count"] > 1: + connection_state["reconnect_count"] = connection_state.get("reconnect_count", 0) + 1 # Check if this is a reconnection # paho-mqtt v2 provides ConnectFlags with attribute 'session_present' @@ -1121,6 +1531,11 @@ def main(): group_id_topic = f"infoscreen/{client_id}/group_id" client.subscribe(group_id_topic) logging.info(f"Subscribed to: {group_id_topic}") + + # Command topics (canonical + transitional) + client.subscribe(command_topic, qos=1) + client.subscribe(command_topic_alias, qos=1) + logging.info(f"Subscribed to command topics: {command_topic}, {command_topic_alias}") # Wenn beim Start eine group_id vorhanden ist, sofort Event-Topic abonnieren # Reset event_topic so subscribe_event_topic always re-registers with the broker @@ -1233,6 +1648,189 @@ def main(): client.message_callback_add(group_id_topic, on_group_id_message) logging.info(f"Current group_id at start: {current_group_id if current_group_id else 'none'}") + def mark_command_processed(command_id, status, error_code=None): + processed_commands[command_id] = { + "status": status, + "error_code": error_code, + "processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + } + persist_processed_commands(processed_commands) + + def execute_command_action(action): + try: + proc = subprocess.run( + ["sudo", COMMAND_HELPER_PATH, action], + timeout=max(1, COMMAND_EXEC_TIMEOUT_SEC), + check=False, + capture_output=True, + text=True, + ) + except subprocess.TimeoutExpired: + return False, "execution_timeout", f"command timed out after {COMMAND_EXEC_TIMEOUT_SEC}s" + except PermissionError: + return False, "permission_denied_local", "permission denied invoking command helper" + except Exception as e: + return False, "internal_error", f"internal execution error: {e}" + + if proc.returncode != 0: + stderr = (proc.stderr or "").strip() + if proc.returncode in (126, 127): + return False, "permission_denied_local", stderr or "command helper unavailable" + return False, "execution_failed", stderr or f"helper exited with code {proc.returncode}" + + return True, None, None + + def on_command_message(client, userdata, msg, properties=None): + payload_text = msg.payload.decode().strip() + received_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + expires_at = None + + if not payload_text: + publish_command_ack( + client, + client_id, + NIL_COMMAND_ID, + "failed", + error_code="invalid_schema", + error_message="empty command payload", + ) + return + + try: + payload = json.loads(payload_text) + except json.JSONDecodeError as e: + publish_command_ack( + client, + client_id, + NIL_COMMAND_ID, + "failed", + error_code="invalid_schema", + error_message=f"invalid JSON: {e}", + ) + return + + command_id_hint = _extract_command_id(payload) + if isinstance(payload, dict): + expires_at = payload.get("expires_at") + + is_valid, normalized, error_code, error_message = validate_command_payload(payload, client_id) + if not is_valid: + publish_command_ack( + client, + client_id, + command_id_hint, + "failed", + error_code=error_code, + error_message=error_message, + expires_at=expires_at, + ) + return + + command_id = normalized["command_id"] + expires_at = normalized["expires_at"] + action = normalized["action"] + + if command_id in processed_commands: + publish_command_ack( + client, + client_id, + command_id, + "failed", + error_code="duplicate_command", + error_message="command_id already processed", + expires_at=expires_at, + ) + return + + publish_command_ack( + client, + client_id, + command_id, + "accepted", + expires_at=expires_at, + ) + + publish_command_ack( + client, + client_id, + command_id, + "execution_started", + expires_at=expires_at, + ) + + write_last_command_state({ + "command_id": command_id, + "action": action, + "ack_status": "execution_started", + "received_at": received_at, + "execution_started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "expires_at": expires_at, + "source_topic": msg.topic, + }) + + ok, exec_error_code, exec_error_message = execute_command_action(action) + if ok: + if command_requires_recovery_completion(action): + if command_mock_reboot_immediate_complete_enabled(action): + logging.info( + "Mock reboot immediate completion enabled: command_id=%s action=%s", + command_id, + action, + ) + else: + logging.info( + "Command entered recovery completion path: command_id=%s action=%s", + command_id, + action, + ) + return + + logging.info( + "Command continuing to immediate completion path: command_id=%s action=%s", + command_id, + action, + ) + + publish_command_ack( + client, + client_id, + command_id, + "completed", + expires_at=expires_at, + ) + mark_command_processed(command_id, "completed") + write_last_command_state({ + "command_id": command_id, + "action": action, + "ack_status": "completed", + "completed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "expires_at": expires_at, + }) + return + + publish_command_ack( + client, + client_id, + command_id, + "failed", + error_code=exec_error_code, + error_message=exec_error_message, + expires_at=expires_at, + ) + mark_command_processed(command_id, "failed", error_code=exec_error_code) + write_last_command_state({ + "command_id": command_id, + "action": action, + "ack_status": "failed", + "error_code": exec_error_code, + "error_message": exec_error_message, + "failed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "expires_at": expires_at, + }) + + client.message_callback_add(command_topic, on_command_message) + client.message_callback_add(command_topic_alias, on_command_message) + def on_power_intent_message(client, userdata, msg, properties=None): nonlocal last_power_intent_id, last_power_issued_at @@ -1396,7 +1994,8 @@ def main(): # Heartbeat-Loop with connection state monitoring last_heartbeat = 0 logging.info("Entering heartbeat loop (network loop already running in background thread)") - + _sd_notify("READY=1") # tell systemd the process is fully initialised + while True: try: current_time = time.time() @@ -1416,7 +2015,31 @@ def main(): if result.rc == mqtt.MQTT_ERR_SUCCESS: logging.info("Heartbeat sent.") # Also send health and screenshot heartbeats - publish_health_message(client, client_id) + publish_health_message(client, client_id, connection_state) + if ( + isinstance(pending_recovery_command, dict) + and pending_recovery_command.get("ack_status") == "execution_started" + and pending_recovery_command.get("action") == "reboot_host" + and pending_recovery_command.get("command_id") + ): + recovered_command_id = pending_recovery_command.get("command_id") + recovered_expires = pending_recovery_command.get("expires_at") + publish_command_ack( + client, + client_id, + recovered_command_id, + "completed", + expires_at=recovered_expires, + ) + mark_command_processed(recovered_command_id, "completed") + write_last_command_state({ + "command_id": recovered_command_id, + "action": pending_recovery_command.get("action"), + "ack_status": "recovered", + "recovered_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "expires_at": recovered_expires, + }) + pending_recovery_command = None elif result.rc == mqtt.MQTT_ERR_NO_CONN: logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...") time.sleep(2) @@ -1424,7 +2047,7 @@ def main(): retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0) if retry.rc == mqtt.MQTT_ERR_SUCCESS: logging.info("Heartbeat sent after retry.") - publish_health_message(client, client_id) + publish_health_message(client, client_id, connection_state) else: logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}") else: @@ -1435,6 +2058,7 @@ def main(): logging.debug("Skipping heartbeat - MQTT not connected (is_connected=False)") last_heartbeat = current_time + _sd_notify("WATCHDOG=1") # kick systemd watchdog each loop iteration time.sleep(5) except KeyboardInterrupt: logging.info("Shutting down gracefully...") diff --git a/tests/test_command_intake.py b/tests/test_command_intake.py new file mode 100644 index 0000000..57fd921 --- /dev/null +++ b/tests/test_command_intake.py @@ -0,0 +1,287 @@ +""" +Unit tests for reboot/shutdown command intake primitives. + +Run from project root (venv activated): + python -m pytest tests/test_command_intake.py -v +""" + +import os +import sys +import json +import tempfile +import unittest +from datetime import datetime, timezone, timedelta +from unittest.mock import patch + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) + +from simclient import ( # noqa: E402 + NIL_COMMAND_ID, + command_requires_recovery_completion, + command_mock_reboot_immediate_complete_enabled, + configure_mqtt_security, + mqtt, + validate_command_payload, + publish_command_ack, + _prune_processed_commands, + load_processed_commands, + persist_processed_commands, +) + + +class FakePublishResult: + def __init__(self, rc): + self.rc = rc + + +class FakeMqttClient: + def __init__(self, rc=0): + self.rc = rc + self.calls = [] + + def publish(self, topic, payload, qos=0, retain=False): + self.calls.append({ + "topic": topic, + "payload": payload, + "qos": qos, + "retain": retain, + }) + return FakePublishResult(self.rc) + + +class SequencedMqttClient: + def __init__(self, rc_sequence): + self._rc_sequence = list(rc_sequence) + self.calls = [] + + def publish(self, topic, payload, qos=0, retain=False): + rc = self._rc_sequence.pop(0) if self._rc_sequence else 0 + self.calls.append({ + "topic": topic, + "payload": payload, + "qos": qos, + "retain": retain, + "rc": rc, + }) + return FakePublishResult(rc) + + +class FakeSecurityClient: + def __init__(self): + self.username = None + self.password = None + self.tls_kwargs = None + self.tls_insecure = None + + def username_pw_set(self, username, password=None): + self.username = username + self.password = password + + def tls_set(self, **kwargs): + self.tls_kwargs = kwargs + + def tls_insecure_set(self, enabled): + self.tls_insecure = enabled + + +def _valid_payload(seconds_valid=240): + now = datetime.now(timezone.utc) + exp = now + timedelta(seconds=seconds_valid) + return { + "schema_version": "1.0", + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "action": "reboot_host", + "issued_at": now.strftime("%Y-%m-%dT%H:%M:%SZ"), + "expires_at": exp.strftime("%Y-%m-%dT%H:%M:%SZ"), + "requested_by": 1, + "reason": "operator_request", + } + + +class TestValidateCommandPayload(unittest.TestCase): + def test_accepts_valid_payload(self): + payload = _valid_payload() + ok, normalized, code, msg = validate_command_payload(payload, payload["client_uuid"]) + self.assertTrue(ok) + self.assertIsNone(code) + self.assertIsNone(msg) + self.assertEqual(normalized["action"], "reboot_host") + + def test_rejects_extra_fields(self): + payload = _valid_payload() + payload["extra"] = "x" + ok, _, code, msg = validate_command_payload(payload, payload["client_uuid"]) + self.assertFalse(ok) + self.assertEqual(code, "invalid_schema") + self.assertIn("unexpected fields", msg) + + def test_rejects_stale_command(self): + payload = _valid_payload() + old_issued = datetime.now(timezone.utc) - timedelta(hours=3) + old_expires = datetime.now(timezone.utc) - timedelta(hours=2) + payload["issued_at"] = old_issued.strftime("%Y-%m-%dT%H:%M:%SZ") + payload["expires_at"] = old_expires.strftime("%Y-%m-%dT%H:%M:%SZ") + ok, _, code, _ = validate_command_payload(payload, payload["client_uuid"]) + self.assertFalse(ok) + self.assertEqual(code, "stale_command") + + def test_rejects_action_outside_enum(self): + payload = _valid_payload() + payload["action"] = "restart_service" + ok, _, code, msg = validate_command_payload(payload, payload["client_uuid"]) + self.assertFalse(ok) + self.assertEqual(code, "invalid_schema") + self.assertIn("action must be one of", msg) + + def test_rejects_client_uuid_mismatch(self): + payload = _valid_payload() + ok, _, code, msg = validate_command_payload( + payload, + "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + ) + self.assertFalse(ok) + self.assertEqual(code, "invalid_schema") + self.assertIn("client_uuid", msg) + + +class TestCommandLifecyclePolicy(unittest.TestCase): + def test_reboot_requires_recovery_completion(self): + self.assertTrue(command_requires_recovery_completion("reboot_host")) + self.assertFalse(command_requires_recovery_completion("shutdown_host")) + + def test_mock_reboot_immediate_completion_enabled_for_mock_helper(self): + with patch("simclient.COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE", True), \ + patch("simclient.COMMAND_HELPER_PATH", "/home/pi/scripts/mock-command-helper.sh"): + self.assertTrue(command_mock_reboot_immediate_complete_enabled("reboot_host")) + + def test_mock_reboot_immediate_completion_disabled_for_live_helper(self): + with patch("simclient.COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE", True), \ + patch("simclient.COMMAND_HELPER_PATH", "/usr/local/bin/infoscreen-cmd-helper.sh"): + self.assertFalse(command_mock_reboot_immediate_complete_enabled("reboot_host")) + + +class TestMqttSecurityConfiguration(unittest.TestCase): + def test_configure_username_password(self): + fake_client = FakeSecurityClient() + with patch("simclient.MQTT_USER", ""), \ + patch("simclient.MQTT_PASSWORD_BROKER", ""), \ + patch("simclient.MQTT_USERNAME", "client-user"), \ + patch("simclient.MQTT_PASSWORD", "client-pass"), \ + patch("simclient.MQTT_TLS_ENABLED", False): + configured = configure_mqtt_security(fake_client) + + self.assertEqual(fake_client.username, "client-user") + self.assertEqual(fake_client.password, "client-pass") + self.assertFalse(configured["tls"]) + + def test_configure_tls(self): + fake_client = FakeSecurityClient() + with patch("simclient.MQTT_USER", ""), \ + patch("simclient.MQTT_PASSWORD_BROKER", ""), \ + patch("simclient.MQTT_USERNAME", ""), \ + patch("simclient.MQTT_PASSWORD", ""), \ + patch("simclient.MQTT_TLS_ENABLED", True), \ + patch("simclient.MQTT_TLS_CA_CERT", "/tmp/ca.pem"), \ + patch("simclient.MQTT_TLS_CERT", "/tmp/client.pem"), \ + patch("simclient.MQTT_TLS_KEY", "/tmp/client.key"), \ + patch("simclient.MQTT_TLS_INSECURE", True): + configured = configure_mqtt_security(fake_client) + + self.assertTrue(configured["tls"]) + self.assertEqual(fake_client.tls_kwargs["ca_certs"], "/tmp/ca.pem") + self.assertEqual(fake_client.tls_kwargs["certfile"], "/tmp/client.pem") + self.assertEqual(fake_client.tls_kwargs["keyfile"], "/tmp/client.key") + self.assertTrue(fake_client.tls_insecure) + + +class TestAckPublish(unittest.TestCase): + def test_failed_ack_forces_non_null_error_fields(self): + fake_client = FakeMqttClient(rc=0) + ok = publish_command_ack( + fake_client, + "9b8d1856-ff34-4864-a726-12de072d0f77", + NIL_COMMAND_ID, + "failed", + error_code=None, + error_message=None, + ) + self.assertTrue(ok) + self.assertEqual(len(fake_client.calls), 2) + payload = json.loads(fake_client.calls[0]["payload"]) + self.assertEqual(payload["status"], "failed") + self.assertTrue(isinstance(payload["error_code"], str) and payload["error_code"]) + self.assertTrue(isinstance(payload["error_message"], str) and payload["error_message"]) + + def test_retry_on_broker_disconnect_then_success(self): + # First loop (2 topics): NO_CONN, NO_CONN. Second loop: success, success. + fake_client = SequencedMqttClient([ + mqtt.MQTT_ERR_NO_CONN, + mqtt.MQTT_ERR_NO_CONN, + mqtt.MQTT_ERR_SUCCESS, + mqtt.MQTT_ERR_SUCCESS, + ]) + future_expiry = (datetime.now(timezone.utc) + timedelta(seconds=30)).strftime("%Y-%m-%dT%H:%M:%SZ") + + with patch("simclient.time.sleep", return_value=None) as sleep_mock: + ok = publish_command_ack( + fake_client, + "9b8d1856-ff34-4864-a726-12de072d0f77", + "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "accepted", + expires_at=future_expiry, + ) + + self.assertTrue(ok) + self.assertEqual(len(fake_client.calls), 4) + sleep_mock.assert_called_once() + + def test_stop_retry_when_expired(self): + fake_client = SequencedMqttClient([ + mqtt.MQTT_ERR_NO_CONN, + mqtt.MQTT_ERR_NO_CONN, + ]) + past_expiry = (datetime.now(timezone.utc) - timedelta(seconds=30)).strftime("%Y-%m-%dT%H:%M:%SZ") + + with patch("simclient.time.sleep", return_value=None) as sleep_mock: + ok = publish_command_ack( + fake_client, + "9b8d1856-ff34-4864-a726-12de072d0f77", + "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "accepted", + expires_at=past_expiry, + ) + + self.assertFalse(ok) + self.assertEqual(len(fake_client.calls), 2) + sleep_mock.assert_not_called() + + +class TestProcessedCommandsState(unittest.TestCase): + def test_prune_keeps_recent_only(self): + recent = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + old = (datetime.now(timezone.utc) - timedelta(hours=30)).strftime("%Y-%m-%dT%H:%M:%SZ") + commands = { + "a": {"processed_at": recent, "status": "completed"}, + "b": {"processed_at": old, "status": "completed"}, + } + pruned = _prune_processed_commands(commands) + self.assertIn("a", pruned) + self.assertNotIn("b", pruned) + + def test_load_and_persist_round_trip(self): + with tempfile.TemporaryDirectory() as tmpdir: + state_file = os.path.join(tmpdir, "processed_commands.json") + with patch("simclient.PROCESSED_COMMANDS_FILE", state_file): + persist_processed_commands({ + "x": { + "status": "completed", + "processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + } + }) + loaded = load_processed_commands() + self.assertIn("x", loaded) + + +if __name__ == "__main__": + unittest.main()