feat: remote commands, systemd units, process observability, broker auth split
- Command intake (reboot/shutdown) on infoscreen/{uuid}/commands with ack lifecycle
- MQTT_USER/MQTT_PASSWORD_BROKER split from identity vars; configure_mqtt_security() updated
- infoscreen-simclient.service: Type=notify, WatchdogSec=60, Restart=on-failure
- infoscreen-notify-failure@.service + script: retained MQTT alert when systemd gives up (Gap 3)
- _sd_notify() watchdog keepalive in simclient main loop (Gap 1)
- broker_connection block in health payload: reconnect_count, last_disconnect_at (Gap 2)
- COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE canary flag with safety guard
- SERVER_TEAM_ACTIONS.md: server-side integration action items
- Docs: README, CHANGELOG, src/README, copilot-instructions updated
- 43 tests passing
This commit is contained in:
@@ -16,6 +16,17 @@ LOG_LEVEL=INFO # DEBUG | INFO | WARNING | ERROR
|
|||||||
# MQTT Broker Configuration
|
# MQTT Broker Configuration
|
||||||
MQTT_BROKER=<your-mqtt-broker-host-or-ip> # Change to your MQTT server IP
|
MQTT_BROKER=<your-mqtt-broker-host-or-ip> # Change to your MQTT server IP
|
||||||
MQTT_PORT=1883
|
MQTT_PORT=1883
|
||||||
|
# Broker login used by simclient to connect to MQTT
|
||||||
|
MQTT_USER=<broker-username>
|
||||||
|
MQTT_PASSWORD_BROKER=<broker-password>
|
||||||
|
# Optional per-device identity credentials (legacy fallback)
|
||||||
|
MQTT_USERNAME=infoscreen-client-<client-uuid-prefix>
|
||||||
|
MQTT_PASSWORD=<set-per-device-20-char-random-password>
|
||||||
|
MQTT_TLS_ENABLED=0 # 1 when broker TLS is enabled for this client
|
||||||
|
# MQTT_TLS_CA_CERT=/etc/infoscreen/mqtt/ca.crt
|
||||||
|
# MQTT_TLS_CERT=/etc/infoscreen/mqtt/client.crt
|
||||||
|
# MQTT_TLS_KEY=/etc/infoscreen/mqtt/client.key
|
||||||
|
# MQTT_TLS_INSECURE=0 # only for controlled test environments
|
||||||
|
|
||||||
# Timing Configuration (quieter intervals for productive test)
|
# Timing Configuration (quieter intervals for productive test)
|
||||||
HEARTBEAT_INTERVAL=60 # Heartbeat frequency in seconds
|
HEARTBEAT_INTERVAL=60 # Heartbeat frequency in seconds
|
||||||
@@ -47,16 +58,26 @@ CEC_POWER_OFF_WAIT=5 # Seconds to wait after power OFF command (increase
|
|||||||
# hybrid — prefer MQTT intent when present and valid; fall back to local CEC if not
|
# hybrid — prefer MQTT intent when present and valid; fall back to local CEC if not
|
||||||
# mqtt — MQTT intent is authoritative; local CEC only fires as last-resort guard
|
# mqtt — MQTT intent is authoritative; local CEC only fires as last-resort guard
|
||||||
# See README.md "TV Power Intent — Rollout Runbook" before changing from 'local'.
|
# See README.md "TV Power Intent — Rollout Runbook" before changing from 'local'.
|
||||||
POWER_CONTROL_MODE=local # local | hybrid | mqtt
|
POWER_CONTROL_MODE=hybrid # local | hybrid | mqtt
|
||||||
|
|
||||||
# Optional: MQTT authentication (if your broker requires username/password)
|
# Reboot/Shutdown command handling
|
||||||
#MQTT_USERNAME=
|
# Helper installed by ./scripts/install-command-helper.sh
|
||||||
#MQTT_PASSWORD=
|
COMMAND_HELPER_PATH=/usr/local/bin/infoscreen-cmd-helper.sh
|
||||||
|
# Mock mode (safe canary): uncomment next line and comment the live path above
|
||||||
|
# COMMAND_HELPER_PATH=/home/olafn/infoscreen-dev/scripts/mock-command-helper.sh
|
||||||
|
# Timeout for helper execution (seconds)
|
||||||
|
COMMAND_EXEC_TIMEOUT_SEC=15
|
||||||
|
# Test mode: for reboot_host with mock helper, send completed without restart (0/1)
|
||||||
|
COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE=0
|
||||||
|
# Command deduplication retention window (hours)
|
||||||
|
COMMAND_DEDUPE_TTL_HOURS=24
|
||||||
|
# Maximum processed command IDs kept in dedupe cache
|
||||||
|
COMMAND_DEDUPE_MAX_ENTRIES=5000
|
||||||
|
|
||||||
|
# MQTT authentication
|
||||||
|
# Use a per-client service account. Keep this file mode 600 on the device.
|
||||||
|
|
||||||
# Optional TLS settings (if using secure MQTT)
|
# Optional TLS settings (if using secure MQTT)
|
||||||
#MQTT_TLS_CA_CERT=
|
|
||||||
#MQTT_TLS_CERT=
|
|
||||||
#MQTT_TLS_KEY=
|
|
||||||
|
|
||||||
# Notes:
|
# Notes:
|
||||||
# - Keep actual secrets and host-specific values in a local .env file that is NOT committed.
|
# - Keep actual secrets and host-specific values in a local .env file that is NOT committed.
|
||||||
|
|||||||
6
.github/copilot-instructions.md
vendored
6
.github/copilot-instructions.md
vendored
@@ -45,6 +45,7 @@ Use specialist docs for deep operational details:
|
|||||||
- `HDMI_CEC_SETUP.md` (CEC setup/troubleshooting)
|
- `HDMI_CEC_SETUP.md` (CEC setup/troubleshooting)
|
||||||
- `SCREENSHOT_MQTT_FIX.md` (screenshot race-condition fixes)
|
- `SCREENSHOT_MQTT_FIX.md` (screenshot race-condition fixes)
|
||||||
- `src/README.md` (developer-focused architecture/debugging)
|
- `src/README.md` (developer-focused architecture/debugging)
|
||||||
|
- `SERVER_TEAM_ACTIONS.md` (server-side integration action items)
|
||||||
|
|
||||||
## Critical Rules
|
## Critical Rules
|
||||||
|
|
||||||
@@ -60,12 +61,13 @@ Use specialist docs for deep operational details:
|
|||||||
- Root `README.md` is a landing page; do not re-expand it into a full manual.
|
- Root `README.md` is a landing page; do not re-expand it into a full manual.
|
||||||
- TV power rollout guidance lives in `TV_POWER_RUNBOOK.md`.
|
- TV power rollout guidance lives in `TV_POWER_RUNBOOK.md`.
|
||||||
- TV power contract truth lives in `TV_POWER_INTENT_SERVER_CONTRACT_V1.md`.
|
- TV power contract truth lives in `TV_POWER_INTENT_SERVER_CONTRACT_V1.md`.
|
||||||
|
- `MQTT_USER`/`MQTT_PASSWORD_BROKER` are broker login credentials; `MQTT_USERNAME`/`MQTT_PASSWORD` are legacy identity fields. Never confuse the two.
|
||||||
|
|
||||||
## Architecture Snapshot
|
## Architecture Snapshot
|
||||||
|
|
||||||
Two-process design:
|
Two-process design:
|
||||||
|
|
||||||
- `src/simclient.py`: MQTT communication, discovery, group assignment, event intake, heartbeat, dashboard publish, power intent ingestion.
|
- `src/simclient.py`: MQTT communication, discovery, group assignment, event intake, heartbeat, dashboard publish, power intent ingestion, remote command intake.
|
||||||
- `src/display_manager.py`: content display lifecycle, HDMI-CEC, screenshot capture, runtime process health.
|
- `src/display_manager.py`: content display lifecycle, HDMI-CEC, screenshot capture, runtime process health.
|
||||||
|
|
||||||
Runtime coordination files:
|
Runtime coordination files:
|
||||||
@@ -105,6 +107,8 @@ Runtime coordination files:
|
|||||||
- Power intent application: `src/display_manager.py` -> `_apply_mqtt_power_intent()`
|
- Power intent application: `src/display_manager.py` -> `_apply_mqtt_power_intent()`
|
||||||
- Screenshot capture logic: `src/display_manager.py` -> `_capture_screenshot()`
|
- Screenshot capture logic: `src/display_manager.py` -> `_capture_screenshot()`
|
||||||
- Dashboard payload: `src/simclient.py` -> `_build_dashboard_payload()`
|
- Dashboard payload: `src/simclient.py` -> `_build_dashboard_payload()`
|
||||||
|
- Remote command intake: `src/simclient.py` -> `on_command_message()`
|
||||||
|
- Command validation: `src/simclient.py` -> `validate_command_payload()`
|
||||||
- File URL rewriting: `src/simclient.py` -> `resolve_file_url()`
|
- File URL rewriting: `src/simclient.py` -> `resolve_file_url()`
|
||||||
|
|
||||||
## Documentation Policy
|
## Documentation Policy
|
||||||
|
|||||||
41
CHANGELOG.md
41
CHANGELOG.md
@@ -2,6 +2,47 @@
|
|||||||
|
|
||||||
## April 2026
|
## April 2026
|
||||||
|
|
||||||
|
### Remote Command Intake
|
||||||
|
|
||||||
|
- Added MQTT command intake on `infoscreen/{client_id}/commands` (supports `reboot` and `shutdown`).
|
||||||
|
- Added command acknowledgement publishing to `infoscreen/{client_id}/commands/ack` and `infoscreen/{client_id}/command/ack` with states `accepted`, `rejected`, `execution_started`, `completed`, `failed`.
|
||||||
|
- Added `COMMAND_HELPER_PATH` environment variable; command execution delegated to an external shell helper so `simclient.py` requires no elevated privileges.
|
||||||
|
- Added deduplication of commands by `command_id` with configurable TTL (`COMMAND_DEDUPE_TTL_HOURS`) and max-entries cap (`COMMAND_DEDUPE_MAX_ENTRIES`).
|
||||||
|
- Added execution timeout (`COMMAND_EXEC_TIMEOUT_SEC`).
|
||||||
|
- Added `COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE` flag for canary and test environments — immediately completes a mock reboot without waiting for process restart. Safety-guarded: only activates when the helper basename is `mock-command-helper.sh`.
|
||||||
|
|
||||||
|
### MQTT Broker Authentication Split
|
||||||
|
|
||||||
|
- Split broker connection credentials (`MQTT_USER`, `MQTT_PASSWORD_BROKER`) from legacy per-device identity fields (`MQTT_USERNAME`, `MQTT_PASSWORD`).
|
||||||
|
- `configure_mqtt_security()` now prefers `MQTT_USER`/`MQTT_PASSWORD_BROKER` for broker login, with fallback to legacy vars if broker-specific vars are absent.
|
||||||
|
|
||||||
|
### Systemd Service Units
|
||||||
|
|
||||||
|
- Added `scripts/infoscreen-simclient.service` — systemd unit for `simclient.py` with `Type=notify`, `WatchdogSec=60`, `Restart=on-failure`, `StartLimitBurst=5`.
|
||||||
|
- Added `scripts/start-simclient.sh` — launcher script mirroring `start-display-manager.sh`.
|
||||||
|
- Updated `scripts/infoscreen-display.service` with `OnFailure=infoscreen-notify-failure@%n.service`.
|
||||||
|
- Updated `src/pi-setup.sh` to install and enable both units plus the failure notifier template.
|
||||||
|
|
||||||
|
### Process Watchdog (Gap 1 — Hung Process Detection)
|
||||||
|
|
||||||
|
- Added zero-dependency `_sd_notify()` raw socket helper in `simclient.py` (no `systemd-python` package required).
|
||||||
|
- Sends `READY=1` on main loop entry and `WATCHDOG=1` on every 5-second iteration.
|
||||||
|
- Service unit uses `Type=notify` and `WatchdogSec=60`; systemd will restart the process if it stops sending keepalives for 60 seconds.
|
||||||
|
|
||||||
|
### OnFailure MQTT Notifier (Gap 3 — systemd Give-Up Detection)
|
||||||
|
|
||||||
|
- Added `scripts/infoscreen-notify-failure@.service` — systemd template unit triggered by `OnFailure=`.
|
||||||
|
- Added `scripts/infoscreen-notify-failure.sh` — publishes a retained JSON payload to `infoscreen/{uuid}/service_failed` via `mosquitto_pub` so the monitoring dashboard gets an alert even when the process is fully dead.
|
||||||
|
- Payload: `{"event":"service_failed","unit":"<unit-name>","client_uuid":"...","failed_at":"<ISO-UTC>"}`.
|
||||||
|
|
||||||
|
### Health Payload Broker Connection Block (Gap 2 — Broker vs. Process Ambiguity)
|
||||||
|
|
||||||
|
- Added `broker_connection` block to the health payload: `broker_reachable`, `reconnect_count`, `connect_count`, `last_disconnect_at`.
|
||||||
|
- `simclient.py` now tracks `reconnect_count` and `connect_count` on every `on_connect` callback and `last_disconnect` timestamp on `on_disconnect`.
|
||||||
|
- `publish_health_message()` accepts an optional `connection_state` parameter; both heartbeat-success call sites pass the enriched state.
|
||||||
|
|
||||||
|
### TV Power Coordination
|
||||||
|
|
||||||
- Added Phase 1 TV power coordination on `infoscreen/groups/{group_id}/power/intent`.
|
- Added Phase 1 TV power coordination on `infoscreen/groups/{group_id}/power/intent`.
|
||||||
- Added `POWER_CONTROL_MODE` with `local`, `hybrid`, and `mqtt` behavior.
|
- Added `POWER_CONTROL_MODE` with `local`, `hybrid`, and `mqtt` behavior.
|
||||||
- Added `src/power_intent_state.json` and `src/power_state.json` for power IPC and telemetry.
|
- Added `src/power_intent_state.json` and `src/power_state.json` for power IPC and telemetry.
|
||||||
|
|||||||
44
README.md
44
README.md
@@ -51,6 +51,11 @@ LOG_LEVEL=INFO
|
|||||||
|
|
||||||
MQTT_BROKER=192.168.1.100
|
MQTT_BROKER=192.168.1.100
|
||||||
MQTT_PORT=1883
|
MQTT_PORT=1883
|
||||||
|
MQTT_USER=<broker-username>
|
||||||
|
MQTT_PASSWORD_BROKER=<broker-password>
|
||||||
|
MQTT_USERNAME=infoscreen-client-<client-uuid-prefix>
|
||||||
|
MQTT_PASSWORD=<per-device-random-password>
|
||||||
|
MQTT_TLS_ENABLED=0
|
||||||
|
|
||||||
HEARTBEAT_INTERVAL=60
|
HEARTBEAT_INTERVAL=60
|
||||||
SCREENSHOT_INTERVAL=180
|
SCREENSHOT_INTERVAL=180
|
||||||
@@ -68,8 +73,22 @@ CEC_POWER_ON_WAIT=5
|
|||||||
CEC_POWER_OFF_WAIT=5
|
CEC_POWER_OFF_WAIT=5
|
||||||
|
|
||||||
POWER_CONTROL_MODE=local
|
POWER_CONTROL_MODE=local
|
||||||
|
|
||||||
|
COMMAND_HELPER_PATH=/usr/local/bin/infoscreen-cmd-helper.sh
|
||||||
|
COMMAND_EXEC_TIMEOUT_SEC=15
|
||||||
|
COMMAND_DEDUPE_TTL_HOURS=24
|
||||||
|
COMMAND_DEDUPE_MAX_ENTRIES=5000
|
||||||
|
COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE=0
|
||||||
```
|
```
|
||||||
|
|
||||||
|
MQTT auth/TLS notes:
|
||||||
|
|
||||||
|
- `MQTT_USER` / `MQTT_PASSWORD_BROKER` are the broker credentials used at connection time.
|
||||||
|
- `MQTT_USERNAME` / `MQTT_PASSWORD` are legacy per-device identity fields kept for fallback and identity purposes.
|
||||||
|
- Store real broker credentials only in the local [/.env](.env), which is gitignored.
|
||||||
|
- When TLS is enabled, also set `MQTT_TLS_CA_CERT`, and if client certificates are used, `MQTT_TLS_CERT` and `MQTT_TLS_KEY`.
|
||||||
|
- Keep the local [/.env](.env) readable only by the service user and admins, for example mode `600`.
|
||||||
|
|
||||||
Mode summary:
|
Mode summary:
|
||||||
|
|
||||||
- `POWER_CONTROL_MODE=local`: local event-time CEC only.
|
- `POWER_CONTROL_MODE=local`: local event-time CEC only.
|
||||||
@@ -78,21 +97,23 @@ Mode summary:
|
|||||||
|
|
||||||
### 3. Start Services
|
### 3. Start Services
|
||||||
|
|
||||||
|
The preferred method on deployed devices is systemd:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd ~/infoscreen-dev/src
|
sudo systemctl start infoscreen-simclient infoscreen-display
|
||||||
python3 simclient.py
|
sudo systemctl status infoscreen-simclient infoscreen-display
|
||||||
|
sudo journalctl -u infoscreen-simclient -u infoscreen-display -f
|
||||||
```
|
```
|
||||||
|
|
||||||
In a second terminal:
|
For first-time setup, run `src/pi-setup.sh` to install and enable the units. See [src/README.md](src/README.md) for the systemd setup steps.
|
||||||
|
|
||||||
|
For local development without systemd:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd ~/infoscreen-dev/src
|
# Terminal 1
|
||||||
python3 display_manager.py
|
./scripts/start-simclient.sh
|
||||||
```
|
|
||||||
|
|
||||||
Or use the helper script:
|
# Terminal 2
|
||||||
|
|
||||||
```bash
|
|
||||||
./scripts/start-display-manager.sh
|
./scripts/start-display-manager.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -154,6 +175,7 @@ Use the helper scripts in `scripts/` for focused tests:
|
|||||||
- `./scripts/test-impressive.sh`: single-play presentation.
|
- `./scripts/test-impressive.sh`: single-play presentation.
|
||||||
- `./scripts/test-impressive-loop.sh`: looping presentation.
|
- `./scripts/test-impressive-loop.sh`: looping presentation.
|
||||||
- `./scripts/test-mqtt.sh`: MQTT broker connectivity.
|
- `./scripts/test-mqtt.sh`: MQTT broker connectivity.
|
||||||
|
- `./scripts/test-reboot-command.sh`: end-to-end reboot/shutdown command lifecycle canary (`accepted -> execution_started -> completed/failed`).
|
||||||
- `./scripts/test-screenshot.sh`: screenshot capture.
|
- `./scripts/test-screenshot.sh`: screenshot capture.
|
||||||
- `./scripts/test-hdmi-cec.sh`: HDMI-CEC diagnostics and runtime state inspection.
|
- `./scripts/test-hdmi-cec.sh`: HDMI-CEC diagnostics and runtime state inspection.
|
||||||
- `./scripts/test-power-intent.sh`: MQTT power intent publishing, rejection tests, and telemetry checks.
|
- `./scripts/test-power-intent.sh`: MQTT power intent publishing, rejection tests, and telemetry checks.
|
||||||
@@ -173,7 +195,8 @@ Quick checks:
|
|||||||
- Follow logs: `tail -f logs/display_manager.log src/simclient.log`
|
- Follow logs: `tail -f logs/display_manager.log src/simclient.log`
|
||||||
- Inspect screenshots: `ls -lh src/screenshots/`
|
- Inspect screenshots: `ls -lh src/screenshots/`
|
||||||
- Inspect power state: `cat src/power_intent_state.json` and `cat src/power_state.json`
|
- Inspect power state: `cat src/power_intent_state.json` and `cat src/power_state.json`
|
||||||
- Restart both services: `./scripts/restart-all.sh`
|
- Restart services (systemd): `sudo systemctl restart infoscreen-simclient infoscreen-display`
|
||||||
|
- Restart services (dev): `./scripts/restart-all.sh`
|
||||||
|
|
||||||
## Deployment
|
## Deployment
|
||||||
|
|
||||||
@@ -213,6 +236,7 @@ If running directly on the host, ensure:
|
|||||||
- [src/IMPLEMENTATION_SUMMARY.md](src/IMPLEMENTATION_SUMMARY.md)
|
- [src/IMPLEMENTATION_SUMMARY.md](src/IMPLEMENTATION_SUMMARY.md)
|
||||||
- [TV_POWER_COORDINATION_TASKLIST.md](TV_POWER_COORDINATION_TASKLIST.md)
|
- [TV_POWER_COORDINATION_TASKLIST.md](TV_POWER_COORDINATION_TASKLIST.md)
|
||||||
- [TV_POWER_HANDOFF_SERVER.md](TV_POWER_HANDOFF_SERVER.md)
|
- [TV_POWER_HANDOFF_SERVER.md](TV_POWER_HANDOFF_SERVER.md)
|
||||||
|
- [SERVER_TEAM_ACTIONS.md](SERVER_TEAM_ACTIONS.md)
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
|
|||||||
127
SERVER_TEAM_ACTIONS.md
Normal file
127
SERVER_TEAM_ACTIONS.md
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
# Server Team Action Items — Infoscreen Client
|
||||||
|
|
||||||
|
This document lists everything the server/infrastructure/frontend team must implement to complete the client integration. The client-side code is production-ready for all items listed here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. MQTT Broker Hardening (prerequisite for everything else)
|
||||||
|
|
||||||
|
- Disable anonymous access on the broker.
|
||||||
|
- Create one broker account **per client device**:
|
||||||
|
- Username convention: `infoscreen-client-<uuid-prefix>` (e.g. `infoscreen-client-9b8d1856`)
|
||||||
|
- Provision the password to the device `.env` as `MQTT_PASSWORD_BROKER=`
|
||||||
|
- Create a **server/publisher account** (e.g. `infoscreen-server`) for all server-side publishes.
|
||||||
|
- Enforce ACLs:
|
||||||
|
|
||||||
|
| Topic | Publisher |
|
||||||
|
|---|---|
|
||||||
|
| `infoscreen/{uuid}/commands` | server only |
|
||||||
|
| `infoscreen/{uuid}/command` (alias) | server only |
|
||||||
|
| `infoscreen/{uuid}/group_id` | server only |
|
||||||
|
| `infoscreen/events/{group_id}` | server only |
|
||||||
|
| `infoscreen/groups/+/power/intent` | server only |
|
||||||
|
| `infoscreen/{uuid}/commands/ack` | client only |
|
||||||
|
| `infoscreen/{uuid}/command/ack` | client only |
|
||||||
|
| `infoscreen/{uuid}/heartbeat` | client only |
|
||||||
|
| `infoscreen/{uuid}/health` | client only |
|
||||||
|
| `infoscreen/{uuid}/logs/#` | client only |
|
||||||
|
| `infoscreen/{uuid}/service_failed` | client only |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Reboot / Shutdown Command — Ack Lifecycle
|
||||||
|
|
||||||
|
Client publishes ack status updates to two topics per command (canonical + transitional alias):
|
||||||
|
- `infoscreen/{uuid}/commands/ack`
|
||||||
|
- `infoscreen/{uuid}/command/ack`
|
||||||
|
|
||||||
|
**Ack payload schema (v1, frozen):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"command_id": "07aab032-53c2-45ef-a5a3-6aa58e9d9fae",
|
||||||
|
"status": "accepted | execution_started | completed | failed",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Status lifecycle:**
|
||||||
|
|
||||||
|
| Status | When | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `accepted` | Command received and validated | Immediate |
|
||||||
|
| `execution_started` | Helper invoked | Immediate after accepted |
|
||||||
|
| `completed` | Execution confirmed | For `reboot_host`: arrives after reconnect (10–90 s after `execution_started`) |
|
||||||
|
| `failed` | Helper returned error | `error_code` and `error_message` will be set |
|
||||||
|
|
||||||
|
**Server must:**
|
||||||
|
- Track `command_id` through the full lifecycle and update status in DB/UI.
|
||||||
|
- Surface `failed` + `error_code` to the operator UI.
|
||||||
|
- Expect `reboot_host` `completed` to arrive after a reconnect delay — do not treat the gap as a timeout.
|
||||||
|
- Use `expires_at` from the original command to determine when to abandon waiting.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Health Dashboard — Broker Connection Fields (Gap 2)
|
||||||
|
|
||||||
|
Every `infoscreen/{uuid}/health` payload now includes a `broker_connection` block:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"timestamp": "2026-04-05T08:00:00.000000+00:00",
|
||||||
|
"expected_state": { "event_id": 42 },
|
||||||
|
"actual_state": {
|
||||||
|
"process": "display_manager",
|
||||||
|
"pid": 1234,
|
||||||
|
"status": "running"
|
||||||
|
},
|
||||||
|
"broker_connection": {
|
||||||
|
"broker_reachable": true,
|
||||||
|
"reconnect_count": 2,
|
||||||
|
"last_disconnect_at": "2026-04-04T10:30:00Z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Server must:**
|
||||||
|
- Display `reconnect_count` and `last_disconnect_at` per device in the health dashboard.
|
||||||
|
- Implement alerting heuristic:
|
||||||
|
- **All** clients go silent simultaneously → likely broker outage, not device crash.
|
||||||
|
- **Single** client goes silent → device crash, network failure, or process hang.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Service-Failed MQTT Notification (Gap 3)
|
||||||
|
|
||||||
|
When systemd gives up restarting a service after repeated crashes (`StartLimitBurst` exceeded), the client automatically publishes a **retained** message:
|
||||||
|
|
||||||
|
**Topic:** `infoscreen/{uuid}/service_failed`
|
||||||
|
|
||||||
|
**Payload:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"event": "service_failed",
|
||||||
|
"unit": "infoscreen-simclient.service",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"failed_at": "2026-04-05T08:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Server must:**
|
||||||
|
- Subscribe to `infoscreen/+/service_failed` on startup (retained — message survives broker restart).
|
||||||
|
- Alert the operator immediately when this topic receives a payload.
|
||||||
|
- **Clear the retained message** once the device is acknowledged or recovered:
|
||||||
|
```
|
||||||
|
mosquitto_pub -t "infoscreen/{uuid}/service_failed" -n --retain
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. No Server Action Required
|
||||||
|
|
||||||
|
These items are fully implemented client-side and require no server changes:
|
||||||
|
|
||||||
|
- systemd watchdog (`WatchdogSec=60`) — hangs detected and process restarted automatically.
|
||||||
|
- Command deduplication — `command_id` deduplicated with 24-hour TTL.
|
||||||
|
- Ack retry backoff — client retries ack publish on broker disconnect until `expires_at`.
|
||||||
|
- Mock helper / test mode (`COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE`) — development only.
|
||||||
88
TODO.md
88
TODO.md
@@ -25,10 +25,98 @@ This file tracks higher-level todos and design notes for the infoscreen client.
|
|||||||
- `set_volume()` issues appropriate CEC commands and returns success/failure.
|
- `set_volume()` issues appropriate CEC commands and returns success/failure.
|
||||||
- Document any platform limitations (some TVs don't support absolute volume via CEC).
|
- Document any platform limitations (some TVs don't support absolute volume via CEC).
|
||||||
|
|
||||||
|
## Systemd crash recovery (server team recommendation)
|
||||||
|
|
||||||
|
Reliable restart-on-crash for both processes must be handled by **systemd**, not by in-process watchdogs or ad-hoc shell scripts.
|
||||||
|
|
||||||
|
### What needs to be done
|
||||||
|
|
||||||
|
- `display_manager`: already has `scripts/infoscreen-display.service` with `Restart=on-failure` / `RestartSec=10`.
|
||||||
|
- Review `RestartSec` — may want a short backoff (e.g. 5–15 s) and `StartLimitIntervalSec` + `StartLimitBurst` to prevent thrash loops.
|
||||||
|
- `simclient`: **no service unit exists yet**.
|
||||||
|
- Create `scripts/infoscreen-simclient.service` modelled on the display service.
|
||||||
|
- Use `Restart=on-failure` and `RestartSec=10`.
|
||||||
|
- Wire `EnvironmentFile=/home/olafn/infoscreen-dev/.env` so the unit picks up `.env` variables automatically.
|
||||||
|
- Set `After=network-online.target` so MQTT connection is not attempted before the network is ready.
|
||||||
|
- Both units should be installed and enabled via `src/pi-setup.sh` (`systemctl enable --now`).
|
||||||
|
- After enabling, verify crash recovery with `kill -9 <pid>` and confirm systemd restarts the process within `RestartSec`.
|
||||||
|
|
||||||
|
### Acceptance criteria
|
||||||
|
|
||||||
|
- Both `simclient` and `display_manager` restart automatically within 15 s of any non-intentional exit.
|
||||||
|
- `systemctl status` shows `active (running)` after a crash-induced restart.
|
||||||
|
- `journalctl -u infoscreen-simclient` captures all process output (stdout + stderr).
|
||||||
|
- `pi-setup.sh` idempotently installs and enables both units.
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
|
||||||
|
- Use `Restart=on-failure` — restarts on crashes and signals but not on clean `systemctl stop`, preserving operator control during deployments.
|
||||||
|
- The reboot/shutdown command flow publishes `execution_started` and then exits intentionally; systemd will restart simclient, and the recovery logic in the heartbeat loop will emit `completed` on reconnect. This is the intended lifecycle.
|
||||||
|
|
||||||
|
## Process health observability gaps
|
||||||
|
|
||||||
|
Two scenarios are currently undetected or ambiguous from the server/frontend perspective.
|
||||||
|
|
||||||
|
### Gap 1: Hung / deadlocked process ✅ implemented
|
||||||
|
|
||||||
|
**Solution implemented:** Zero-dependency `_sd_notify()` helper writes directly to `NOTIFY_SOCKET` (raw Unix socket, no extra package). `READY=1` is sent when the heartbeat loop starts; `WATCHDOG=1` is sent every 5 s in the main loop iteration. The service unit uses `Type=notify` + `WatchdogSec=60` — if the main loop freezes for 60 s, systemd kills and restarts the process automatically.
|
||||||
|
|
||||||
|
**To apply on device:**
|
||||||
|
```bash
|
||||||
|
sudo cp ~/infoscreen-dev/scripts/infoscreen-simclient.service /etc/systemd/system/
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl restart infoscreen-simclient
|
||||||
|
```
|
||||||
|
|
||||||
|
### Gap 2: MQTT broker unreachable vs. simclient dead ✅ implemented (client side)
|
||||||
|
|
||||||
|
**Solution implemented:** `connection_state` dict expanded with `reconnect_count` and `connect_count`. `publish_health_message()` now accepts `connection_state` and appends a `broker_connection` block to every health payload:
|
||||||
|
|
||||||
|
```json
|
||||||
|
"broker_connection": {
|
||||||
|
"broker_reachable": true,
|
||||||
|
"reconnect_count": 2,
|
||||||
|
"last_disconnect_at": "2026-04-04T10:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`broker_reachable` = `true` when MQTT is connected at publish time.
|
||||||
|
`reconnect_count` increments on every reconnection (first connect does not count).
|
||||||
|
`last_disconnect_at` is the UTC timestamp of the most recent disconnect.
|
||||||
|
|
||||||
|
**Server-side action still needed:**
|
||||||
|
- Display `reconnect_count` and `last_disconnect_at` in the frontend health dashboard.
|
||||||
|
- Alert heuristic: if **all** clients go silent simultaneously → likely broker issue; if only one → likely device issue.
|
||||||
|
|
||||||
|
### Gap 3: systemd gives up (StartLimitBurst exceeded) ✅ implemented
|
||||||
|
|
||||||
|
**Solution implemented:** `scripts/infoscreen-notify-failure@.service` (template unit) + `scripts/infoscreen-notify-failure.sh`. Both main units have `OnFailure=infoscreen-notify-failure@%n.service`. When systemd marks a service `failed`, the notifier runs once, reads broker credentials from `.env`, reads `client_uuid.txt`, and publishes a retained JSON payload to `infoscreen/{uuid}/service_failed` via `mosquitto_pub`.
|
||||||
|
|
||||||
|
**To apply on device:**
|
||||||
|
```bash
|
||||||
|
sudo cp ~/infoscreen-dev/scripts/infoscreen-notify-failure@.service /etc/systemd/system/
|
||||||
|
sudo cp ~/infoscreen-dev/scripts/infoscreen-simclient.service /etc/systemd/system/
|
||||||
|
sudo cp ~/infoscreen-dev/scripts/infoscreen-display.service /etc/systemd/system/
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl restart infoscreen-simclient infoscreen-display
|
||||||
|
```
|
||||||
|
|
||||||
|
**Topic:** `infoscreen/{client_uuid}/service_failed` (retained)
|
||||||
|
**Payload:** `{"event":"service_failed","unit":"infoscreen-simclient.service","client_uuid":"...","failed_at":"2026-..."}`
|
||||||
|
|
||||||
## Next-high-level items
|
## Next-high-level items
|
||||||
|
|
||||||
- Add environment-controlled libVLC hw-accel toggle (`VLC_HW_ACCEL=1|0`) to `display_manager.py` so software decode can be forced when necessary.
|
- Add environment-controlled libVLC hw-accel toggle (`VLC_HW_ACCEL=1|0`) to `display_manager.py` so software decode can be forced when necessary.
|
||||||
- Add automated tests for video start/stop lifecycle (mock python-vlc) to ensure resources are released on event end.
|
- Add automated tests for video start/stop lifecycle (mock python-vlc) to ensure resources are released on event end.
|
||||||
|
- Add allowlist validation for `website` / `webuntis` event URLs
|
||||||
|
- Goal: restrict browser-based events to approved hosts and schemes even if an authenticated publisher sends an unsafe URL.
|
||||||
|
- Ideas / approaches:
|
||||||
|
- Add env-configurable allowlists for general website hosts and WebUntis hosts.
|
||||||
|
- Allow only `https` by default and reject `file:`, `data:`, `javascript:`, loopback, and private-address URLs unless explicitly allowed.
|
||||||
|
- Enforce the same validation on both server-side payload generation and client-side execution in `display_manager.py`.
|
||||||
|
- Acceptance criteria:
|
||||||
|
- Unsafe or unapproved URLs are rejected before Chromium is launched.
|
||||||
|
- WebUntis and approved website events still work with explicit allowlist configuration.
|
||||||
|
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
|
|||||||
149
implementation-plans/reboot-command-payload-schemas.json
Normal file
149
implementation-plans/reboot-command-payload-schemas.json
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
{
|
||||||
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||||
|
"$id": "https://infoscreen.local/schemas/reboot-command-payload-schemas.json",
|
||||||
|
"title": "Infoscreen Reboot Command Payload Schemas",
|
||||||
|
"description": "Frozen v1 schemas for per-client command and command acknowledgement payloads.",
|
||||||
|
"$defs": {
|
||||||
|
"commandPayloadV1": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"schema_version",
|
||||||
|
"command_id",
|
||||||
|
"client_uuid",
|
||||||
|
"action",
|
||||||
|
"issued_at",
|
||||||
|
"expires_at",
|
||||||
|
"requested_by",
|
||||||
|
"reason"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"schema_version": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "1.0"
|
||||||
|
},
|
||||||
|
"command_id": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "uuid"
|
||||||
|
},
|
||||||
|
"client_uuid": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "uuid"
|
||||||
|
},
|
||||||
|
"action": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"reboot_host",
|
||||||
|
"shutdown_host"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"issued_at": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time"
|
||||||
|
},
|
||||||
|
"expires_at": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time"
|
||||||
|
},
|
||||||
|
"requested_by": {
|
||||||
|
"type": [
|
||||||
|
"integer",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"minimum": 1
|
||||||
|
},
|
||||||
|
"reason": {
|
||||||
|
"type": [
|
||||||
|
"string",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"maxLength": 2000
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"commandAckPayloadV1": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"command_id",
|
||||||
|
"status",
|
||||||
|
"error_code",
|
||||||
|
"error_message"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"command_id": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "uuid"
|
||||||
|
},
|
||||||
|
"status": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"accepted",
|
||||||
|
"execution_started",
|
||||||
|
"completed",
|
||||||
|
"failed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"error_code": {
|
||||||
|
"type": [
|
||||||
|
"string",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"maxLength": 128
|
||||||
|
},
|
||||||
|
"error_message": {
|
||||||
|
"type": [
|
||||||
|
"string",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"maxLength": 4000
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"allOf": [
|
||||||
|
{
|
||||||
|
"if": {
|
||||||
|
"properties": {
|
||||||
|
"status": {
|
||||||
|
"const": "failed"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"then": {
|
||||||
|
"properties": {
|
||||||
|
"error_code": {
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"error_message": {
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"examples": [
|
||||||
|
{
|
||||||
|
"commandPayloadV1": {
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"action": "reboot_host",
|
||||||
|
"issued_at": "2026-04-03T12:48:10Z",
|
||||||
|
"expires_at": "2026-04-03T12:52:10Z",
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "operator_request"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"commandAckPayloadV1": {
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"status": "execution_started",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
59
implementation-plans/reboot-command-payload-schemas.md
Normal file
59
implementation-plans/reboot-command-payload-schemas.md
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
## Reboot Command Payload Schema Snippets
|
||||||
|
|
||||||
|
This file provides copy-ready validation snippets for client and integration test helpers.
|
||||||
|
|
||||||
|
### Canonical Topics (v1)
|
||||||
|
1. Command topic: infoscreen/{client_uuid}/commands
|
||||||
|
2. Ack topic: infoscreen/{client_uuid}/commands/ack
|
||||||
|
|
||||||
|
### Transitional Compatibility Topics
|
||||||
|
1. Command topic alias: infoscreen/{client_uuid}/command
|
||||||
|
2. Ack topic alias: infoscreen/{client_uuid}/command/ack
|
||||||
|
|
||||||
|
### Canonical Action Values
|
||||||
|
1. reboot_host
|
||||||
|
2. shutdown_host
|
||||||
|
|
||||||
|
### Ack Status Values
|
||||||
|
1. accepted
|
||||||
|
2. execution_started
|
||||||
|
3. completed
|
||||||
|
4. failed
|
||||||
|
|
||||||
|
### JSON Schema Source
|
||||||
|
Use this file for machine validation:
|
||||||
|
1. implementation-plans/reboot-command-payload-schemas.json
|
||||||
|
|
||||||
|
### Minimal Command Schema Snippet
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": ["schema_version", "command_id", "client_uuid", "action", "issued_at", "expires_at", "requested_by", "reason"],
|
||||||
|
"properties": {
|
||||||
|
"schema_version": { "const": "1.0" },
|
||||||
|
"command_id": { "type": "string", "format": "uuid" },
|
||||||
|
"client_uuid": { "type": "string", "format": "uuid" },
|
||||||
|
"action": { "enum": ["reboot_host", "shutdown_host"] },
|
||||||
|
"issued_at": { "type": "string", "format": "date-time" },
|
||||||
|
"expires_at": { "type": "string", "format": "date-time" },
|
||||||
|
"requested_by": { "type": ["integer", "null"] },
|
||||||
|
"reason": { "type": ["string", "null"] }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Minimal Ack Schema Snippet
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": ["command_id", "status", "error_code", "error_message"],
|
||||||
|
"properties": {
|
||||||
|
"command_id": { "type": "string", "format": "uuid" },
|
||||||
|
"status": { "enum": ["accepted", "execution_started", "completed", "failed"] },
|
||||||
|
"error_code": { "type": ["string", "null"] },
|
||||||
|
"error_message": { "type": ["string", "null"] }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
@@ -0,0 +1,169 @@
|
|||||||
|
## Client Team Implementation Spec (Raspberry Pi 5)
|
||||||
|
|
||||||
|
### Mission
|
||||||
|
Implement client-side command handling for reliable restart and shutdown with strict validation, idempotency, acknowledgements, and reboot recovery continuity.
|
||||||
|
|
||||||
|
### Ownership Boundaries
|
||||||
|
1. Client team owns command intake, execution, acknowledgement emission, and post-reboot continuity.
|
||||||
|
2. Platform team owns command issuance, lifecycle aggregation, and server-side escalation logic.
|
||||||
|
3. Client implementation must not assume managed PoE availability.
|
||||||
|
|
||||||
|
### Required Client Behaviors
|
||||||
|
|
||||||
|
### Frozen MQTT Topics and Schemas (v1)
|
||||||
|
1. Canonical command topic: infoscreen/{client_uuid}/commands.
|
||||||
|
2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack.
|
||||||
|
3. Transitional compatibility topics during migration:
|
||||||
|
- infoscreen/{client_uuid}/command
|
||||||
|
- infoscreen/{client_uuid}/command/ack
|
||||||
|
4. QoS policy: command QoS 1, ack QoS 1 recommended.
|
||||||
|
5. Retain policy: commands and acks are non-retained.
|
||||||
|
6. Client migration behavior: subscribe to both command topics and publish to both ack topics during migration.
|
||||||
|
|
||||||
|
Frozen command payload schema:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"action": "reboot_host",
|
||||||
|
"issued_at": "2026-04-03T12:48:10Z",
|
||||||
|
"expires_at": "2026-04-03T12:52:10Z",
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "operator_request"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Frozen ack payload schema:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"status": "execution_started",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Allowed ack status values:
|
||||||
|
1. accepted
|
||||||
|
2. execution_started
|
||||||
|
3. completed
|
||||||
|
4. failed
|
||||||
|
|
||||||
|
Frozen command action values for v1:
|
||||||
|
1. reboot_host
|
||||||
|
2. shutdown_host
|
||||||
|
|
||||||
|
Reserved but not emitted by server in v1:
|
||||||
|
1. restart_service
|
||||||
|
|
||||||
|
### Client Decision Defaults (v1)
|
||||||
|
1. Privileged helper invocation: sudoers + local helper script (`sudo /usr/local/bin/infoscreen-cmd-helper.sh`).
|
||||||
|
2. Dedupe retention: keep processed command IDs for 24 hours and cap store size to 5000 newest entries.
|
||||||
|
3. Ack retry schedule while broker unavailable: 0.5s, 1s, 2s, 4s, then 5s cap until expires_at.
|
||||||
|
4. Boot-loop handling: server remains authority for safety lockout; client enforces idempotency by command_id and reports local execution outcomes.
|
||||||
|
|
||||||
|
### MQTT Auth Hardening (Current Priority)
|
||||||
|
1. Client must support authenticated MQTT connections for both command and event intake.
|
||||||
|
2. Client must remain compatible with broker ACLs that restrict publish/subscribe rights per topic.
|
||||||
|
3. Client should support TLS broker connections from environment configuration when certificates are provided.
|
||||||
|
4. URL/domain allowlisting for web and webuntis events is explicitly deferred and tracked separately in TODO.md.
|
||||||
|
5. Client credentials are loaded from the local [/.env](.env), not from tracked docs or templates.
|
||||||
|
|
||||||
|
Server-side prerequisites for this client work:
|
||||||
|
1. Broker credentials must be provisioned for clients.
|
||||||
|
2. Broker ACLs must allow each client to subscribe only to its own command topics and assigned event topics.
|
||||||
|
3. Broker ACLs must allow each client to publish only its own ack, heartbeat, health, dashboard, and telemetry topics.
|
||||||
|
4. Server-side publishers must move to authenticated broker access before production rollout.
|
||||||
|
|
||||||
|
Validation snippets for helper scripts:
|
||||||
|
1. Human-readable snippets: implementation-plans/reboot-command-payload-schemas.md
|
||||||
|
2. Machine-validated JSON Schema: implementation-plans/reboot-command-payload-schemas.json
|
||||||
|
|
||||||
|
### 1. Command Intake
|
||||||
|
1. Subscribe to canonical and transitional command topics with QoS 1.
|
||||||
|
2. Parse required fields exactly: schema_version, command_id, client_uuid, action, issued_at, expires_at, requested_by, reason.
|
||||||
|
3. Reject invalid payloads with failed acknowledgement including error_code and diagnostic message.
|
||||||
|
4. Reject stale commands when current time exceeds expires_at.
|
||||||
|
5. Reject already-processed command_id values without re-execution.
|
||||||
|
|
||||||
|
### 2. Idempotency And Persistence
|
||||||
|
1. Persist processed command_id and execution result on local storage.
|
||||||
|
2. Persistence must survive service restart and full OS reboot.
|
||||||
|
3. On restart, reload dedupe cache before processing newly delivered commands.
|
||||||
|
|
||||||
|
### 3. Acknowledgement Contract Behavior
|
||||||
|
1. Emit accepted immediately after successful validation and dedupe pass.
|
||||||
|
2. Emit execution_started immediately before invoking the command action.
|
||||||
|
3. Emit completed only when local success condition is confirmed.
|
||||||
|
4. Emit failed with structured error_code on validation or execution failure.
|
||||||
|
5. If MQTT is temporarily unavailable, retry ack publish with bounded backoff until command expiry.
|
||||||
|
6. Ack payload fields are strict: command_id, status, error_code, error_message (no additional fields).
|
||||||
|
7. For status failed, error_code and error_message must be non-null, non-empty strings.
|
||||||
|
|
||||||
|
### 4. Execution Security Model
|
||||||
|
1. Execute via systemd-managed privileged helper.
|
||||||
|
2. Allow only whitelisted operations:
|
||||||
|
- reboot_host
|
||||||
|
- shutdown_host
|
||||||
|
3. Do not execute restart_service in v1.
|
||||||
|
4. Disallow arbitrary shell commands and untrusted arguments.
|
||||||
|
5. Enforce per-command execution timeout and terminate hung child processes.
|
||||||
|
|
||||||
|
### 5. Reboot Recovery Continuity
|
||||||
|
1. For reboot_host action:
|
||||||
|
- send execution_started
|
||||||
|
- trigger reboot promptly
|
||||||
|
2. During startup:
|
||||||
|
- emit heartbeat early
|
||||||
|
- emit process-health once service is ready
|
||||||
|
3. Keep last command execution state available after reboot for reconciliation.
|
||||||
|
|
||||||
|
### 6. Time And Timeout Semantics
|
||||||
|
1. Use monotonic timers for local elapsed-time checks.
|
||||||
|
2. Use UTC wall-clock only for protocol timestamps and expiry comparisons.
|
||||||
|
3. Target reconnect baseline on Pi 5 USB-SATA SSD: 90 seconds.
|
||||||
|
4. Accept cold-boot and USB enumeration ceiling up to 150 seconds.
|
||||||
|
|
||||||
|
### 7. Capability Reporting
|
||||||
|
1. Report recovery capability class:
|
||||||
|
- software_only
|
||||||
|
- managed_poe_available
|
||||||
|
- manual_only
|
||||||
|
2. Report watchdog enabled status.
|
||||||
|
3. Report boot-source metadata for diagnostics.
|
||||||
|
|
||||||
|
### 8. Error Codes Minimum Set
|
||||||
|
1. invalid_schema
|
||||||
|
2. missing_field
|
||||||
|
3. stale_command
|
||||||
|
4. duplicate_command
|
||||||
|
5. permission_denied_local
|
||||||
|
6. execution_timeout
|
||||||
|
7. execution_failed
|
||||||
|
8. broker_unavailable
|
||||||
|
9. internal_error
|
||||||
|
|
||||||
|
### Acceptance Tests (Client Team)
|
||||||
|
1. Invalid schema payload is rejected and failed ack emitted.
|
||||||
|
2. Expired command is rejected and not executed.
|
||||||
|
3. Duplicate command_id is not executed twice.
|
||||||
|
4. reboot_host emits execution_started and reconnects with heartbeat in expected window.
|
||||||
|
5. shutdown_host action is accepted and invokes local privileged helper without accepting non-whitelisted actions.
|
||||||
|
6. MQTT outage during ack path retries correctly without duplicate execution.
|
||||||
|
7. Client idempotency cooperates with server-side lockout semantics (no local reboot-rate policy).
|
||||||
|
8. Client connects successfully to an authenticated broker and still receives commands and event topics permitted by ACLs.
|
||||||
|
|
||||||
|
### Delivery Artifacts
|
||||||
|
1. Client protocol conformance checklist.
|
||||||
|
2. Test evidence for all acceptance tests.
|
||||||
|
3. Runtime logs showing full lifecycle for one shutdown and one reboot scenario.
|
||||||
|
4. Known limitations list per image version.
|
||||||
|
|
||||||
|
### Definition Of Done
|
||||||
|
1. All acceptance tests pass on Pi 5 USB-SATA SSD test devices.
|
||||||
|
2. No duplicate execution observed under reconnect and retained-delivery edge cases.
|
||||||
|
3. Acknowledgement sequence is complete and machine-parseable for server correlation.
|
||||||
|
4. Reboot recovery continuity works without managed PoE dependencies.
|
||||||
175
implementation-plans/reboot-implementation-handoff-share.md
Normal file
175
implementation-plans/reboot-implementation-handoff-share.md
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
## Remote Reboot Reliability Handoff (Share Document)
|
||||||
|
|
||||||
|
### Purpose
|
||||||
|
This document defines the agreed implementation scope for reliable remote reboot and shutdown of Raspberry Pi 5 clients, with monitoring-first visibility and safe escalation paths.
|
||||||
|
|
||||||
|
### Scope
|
||||||
|
1. In scope: restart and shutdown command reliability.
|
||||||
|
2. In scope: full lifecycle monitoring and audit visibility.
|
||||||
|
3. In scope: capability-tier recovery model with optional managed PoE escalation.
|
||||||
|
4. Out of scope: broader maintenance module in client-management for this cycle.
|
||||||
|
5. Out of scope: mandatory dependency on customer-managed power switching.
|
||||||
|
|
||||||
|
### Agreed Operating Model
|
||||||
|
1. Command delivery is asynchronous and lifecycle-tracked, not fire-and-forget.
|
||||||
|
2. Commands use idempotent command_id semantics with stale-command rejection by expires_at.
|
||||||
|
3. Monitoring is authoritative for operational state and escalation decisions.
|
||||||
|
4. Recovery must function even when no managed power switching is available.
|
||||||
|
|
||||||
|
### Frozen Contract v1 (Effective Immediately)
|
||||||
|
1. Canonical command topic: infoscreen/{client_uuid}/commands.
|
||||||
|
2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack.
|
||||||
|
3. Transitional compatibility topics accepted during migration:
|
||||||
|
- infoscreen/{client_uuid}/command
|
||||||
|
- infoscreen/{client_uuid}/command/ack
|
||||||
|
4. QoS policy: command QoS 1, ack QoS 1 recommended.
|
||||||
|
5. Retain policy: commands and acks are non-retained.
|
||||||
|
|
||||||
|
Command payload schema (frozen):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"action": "reboot_host",
|
||||||
|
"issued_at": "2026-04-03T12:48:10Z",
|
||||||
|
"expires_at": "2026-04-03T12:52:10Z",
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "operator_request"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Ack payload schema (frozen):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"status": "execution_started",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Allowed ack status values:
|
||||||
|
1. accepted
|
||||||
|
2. execution_started
|
||||||
|
3. completed
|
||||||
|
4. failed
|
||||||
|
|
||||||
|
Frozen command action values:
|
||||||
|
1. reboot_host
|
||||||
|
2. shutdown_host
|
||||||
|
|
||||||
|
API endpoint mapping:
|
||||||
|
1. POST /api/clients/{uuid}/restart -> action reboot_host
|
||||||
|
2. POST /api/clients/{uuid}/shutdown -> action shutdown_host
|
||||||
|
|
||||||
|
Validation snippets:
|
||||||
|
1. Human-readable snippets: implementation-plans/reboot-command-payload-schemas.md
|
||||||
|
2. Machine-validated JSON Schema: implementation-plans/reboot-command-payload-schemas.json
|
||||||
|
|
||||||
|
### Command Lifecycle States
|
||||||
|
1. queued
|
||||||
|
2. publish_in_progress
|
||||||
|
3. published
|
||||||
|
4. ack_received
|
||||||
|
5. execution_started
|
||||||
|
6. awaiting_reconnect
|
||||||
|
7. recovered
|
||||||
|
8. completed
|
||||||
|
9. failed
|
||||||
|
10. expired
|
||||||
|
11. timed_out
|
||||||
|
12. canceled
|
||||||
|
13. blocked_safety
|
||||||
|
14. manual_intervention_required
|
||||||
|
|
||||||
|
### Timeout Defaults (Pi 5, USB-SATA SSD baseline)
|
||||||
|
1. queued to publish_in_progress: immediate, timeout 5 seconds.
|
||||||
|
2. publish_in_progress to published: timeout 8 seconds.
|
||||||
|
3. published to ack_received: timeout 20 seconds.
|
||||||
|
4. ack_received to execution_started: 15 seconds for service restart, 25 seconds for host reboot.
|
||||||
|
5. execution_started to awaiting_reconnect: timeout 10 seconds.
|
||||||
|
6. awaiting_reconnect to recovered: baseline 90 seconds after validation, cold-boot ceiling 150 seconds.
|
||||||
|
7. recovered to completed: 15 to 20 seconds based on fleet stability.
|
||||||
|
8. command expires_at default: 240 seconds, bounded 180 to 360 seconds.
|
||||||
|
|
||||||
|
### Recovery Tiers
|
||||||
|
1. Tier 0 baseline, always required: watchdog, systemd auto-restart, lifecycle tracking, manual intervention fallback.
|
||||||
|
2. Tier 1 optional: managed PoE per-port power-cycle escalation where customer infrastructure supports it.
|
||||||
|
3. Tier 2 no remote power control: direct manual intervention workflow.
|
||||||
|
|
||||||
|
### Governance And Safety
|
||||||
|
1. Role access: admin and superadmin.
|
||||||
|
2. Bulk actions require reason capture.
|
||||||
|
3. Safety lockout: maximum 3 reboot commands per client in 15 minutes.
|
||||||
|
4. Escalation cooldown: 60 seconds before automatic move to manual_intervention_required.
|
||||||
|
|
||||||
|
### MQTT Auth Hardening (Phase 1, Required Before Broad Rollout)
|
||||||
|
1. Intranet-only deployment is not sufficient protection for privileged MQTT actions by itself.
|
||||||
|
2. Phase 1 hardening scope is broker authentication, authorization, and network restriction; payload URL allowlisting is deferred to a later client/server feature.
|
||||||
|
3. MQTT broker must disable anonymous publish/subscribe access in production.
|
||||||
|
4. MQTT broker must require authenticated identities for server-side publishers and client devices.
|
||||||
|
5. MQTT broker must enforce ACLs so that:
|
||||||
|
- only server-side services can publish to `infoscreen/{client_uuid}/commands`
|
||||||
|
- only server-side services can publish scheduler event topics
|
||||||
|
- each client can subscribe only to its own command topics and assigned event topics
|
||||||
|
- each client can publish only its own ack, heartbeat, health, dashboard, and telemetry topics
|
||||||
|
6. Broker port exposure must be restricted to the management network and approved hosts only.
|
||||||
|
7. TLS support is strongly recommended in this phase and should be enabled when operationally feasible.
|
||||||
|
|
||||||
|
### Server Team Actions For Auth Hardening
|
||||||
|
1. Provision broker credentials for command/event publishers and for client devices.
|
||||||
|
2. Configure Mosquitto or equivalent broker ACLs for per-topic publish and subscribe restrictions.
|
||||||
|
3. Disable anonymous access on production brokers.
|
||||||
|
4. Restrict broker network exposure with firewall rules, VLAN policy, or equivalent network controls.
|
||||||
|
5. Update server/frontend deployment to publish MQTT with authenticated credentials.
|
||||||
|
6. Validate that server-side event publishing and reboot/shutdown command publishing still work under the new ACL policy.
|
||||||
|
7. Coordinate credential distribution and rotation with the client deployment process.
|
||||||
|
|
||||||
|
### Credential Management Guidance
|
||||||
|
1. Real MQTT passwords must not be stored in tracked documentation or committed templates.
|
||||||
|
2. Each client device should receive a unique broker username and password, stored only in its local [/.env](.env).
|
||||||
|
3. Server-side publisher credentials should be stored in the server team's secret-management path, not in source control.
|
||||||
|
4. Recommended naming convention for client broker users: `infoscreen-client-<client-uuid-prefix>`.
|
||||||
|
5. Client passwords should be random, at least 20 characters, and rotated through deployment tooling or broker administration procedures.
|
||||||
|
6. The server/infrastructure team owns broker-side user creation, ACL assignment, rotation, and revocation.
|
||||||
|
7. The client team owns loading credentials from local env files and validating connection behavior against the secured broker.
|
||||||
|
|
||||||
|
### Client Team Actions For Auth Hardening
|
||||||
|
1. Add MQTT username/password support in the client connection setup.
|
||||||
|
2. Add client-side TLS configuration support from environment when certificates are provided.
|
||||||
|
3. Update local test helpers to support authenticated MQTT publishing and subscription.
|
||||||
|
4. Validate command and event intake against the authenticated broker configuration before canary rollout.
|
||||||
|
|
||||||
|
### Ready For Server/Frontend Team (Auth Phase)
|
||||||
|
1. Client implementation is ready to connect with MQTT auth from local `.env` (`MQTT_USERNAME`, `MQTT_PASSWORD`, optional TLS settings).
|
||||||
|
2. Client command/event intake and client ack/telemetry publishing run over the authenticated MQTT session.
|
||||||
|
3. Server/frontend team must now complete broker-side enforcement and publisher migration.
|
||||||
|
|
||||||
|
Server/frontend done criteria:
|
||||||
|
1. Anonymous broker access is disabled in production.
|
||||||
|
2. Server-side publishers use authenticated broker credentials.
|
||||||
|
3. ACLs are active and validated for command, event, and client telemetry topics.
|
||||||
|
4. At least one canary client proves end-to-end flow under ACLs:
|
||||||
|
- server publishes command/event with authenticated publisher
|
||||||
|
- client receives payload
|
||||||
|
- client sends ack/telemetry successfully
|
||||||
|
5. Revocation test passes: disabling one client credential blocks only that client without impacting others.
|
||||||
|
|
||||||
|
Operational note:
|
||||||
|
1. Client-side auth support is necessary but not sufficient by itself; broker ACL/auth enforcement is the security control that must be enabled by the server/infrastructure team.
|
||||||
|
|
||||||
|
### Rollout Plan
|
||||||
|
1. Contract freeze and sign-off.
|
||||||
|
2. Platform and client implementation against frozen schemas.
|
||||||
|
3. One-group canary.
|
||||||
|
4. Rollback if failed plus timed_out exceeds 5 percent.
|
||||||
|
5. Expand only after 7 days below intervention threshold.
|
||||||
|
|
||||||
|
### Success Criteria
|
||||||
|
1. Deterministic command lifecycle visibility from enqueue to completion.
|
||||||
|
2. No duplicate execution under reconnect or delayed-delivery conditions.
|
||||||
|
3. Stable Pi 5 SSD reconnect behavior within defined baseline.
|
||||||
|
4. Clear and actionable manual intervention states when automatic recovery is exhausted.
|
||||||
54
implementation-plans/reboot-kickoff-summary.md
Normal file
54
implementation-plans/reboot-kickoff-summary.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
## Reboot Reliability Kickoff Summary
|
||||||
|
|
||||||
|
### Objective
|
||||||
|
Ship a reliable, observable restart and shutdown workflow for Raspberry Pi 5 clients, with safe escalation and clear operator outcomes.
|
||||||
|
|
||||||
|
### What Is Included
|
||||||
|
1. Asynchronous command lifecycle with idempotent command_id handling.
|
||||||
|
2. Monitoring-first state visibility from queued to terminal outcomes.
|
||||||
|
3. Client acknowledgements for accepted, execution_started, completed, and failed.
|
||||||
|
4. Pi 5 USB-SATA SSD timeout baseline and tuning rules.
|
||||||
|
5. Capability-tier recovery with optional managed PoE escalation.
|
||||||
|
|
||||||
|
### What Is Not Included
|
||||||
|
1. Full maintenance module in client-management.
|
||||||
|
2. Required managed power-switch integration.
|
||||||
|
3. Production Wake-on-LAN rollout.
|
||||||
|
|
||||||
|
### Team Split
|
||||||
|
1. Platform team: API command lifecycle, safety controls, listener ack ingestion.
|
||||||
|
2. Web team: lifecycle-aware UX and command status display.
|
||||||
|
3. Client team: strict validation, dedupe, ack sequence, secure execution helper, reboot continuity.
|
||||||
|
|
||||||
|
### Ownership Matrix
|
||||||
|
| Team | Primary Plan File | Main Deliverables |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| Platform team | implementation-plans/reboot-implementation-handoff-share.md | Command lifecycle backend, policy enforcement, listener ack mapping, safety lockout and escalation |
|
||||||
|
| Web team | implementation-plans/reboot-implementation-handoff-share.md | Lifecycle UI states, bulk safety UX, capability visibility, command status polling |
|
||||||
|
| Client team | implementation-plans/reboot-implementation-handoff-client-team.md | Command validation, dedupe persistence, ack sequence, secure execution helper, reboot continuity |
|
||||||
|
| Project coordination | implementation-plans/reboot-kickoff-summary.md | Phase sequencing, canary gates, rollback thresholds, cross-team sign-off tracking |
|
||||||
|
|
||||||
|
### Baseline Operational Defaults
|
||||||
|
1. Safety lockout: 3 reboot commands per client in rolling 15 minutes.
|
||||||
|
2. Escalation cooldown: 60 seconds.
|
||||||
|
3. Reconnect target on Pi 5 SSD: 90 seconds baseline, 150 seconds cold-boot ceiling.
|
||||||
|
4. Rollback canary trigger: failed plus timed_out above 5 percent.
|
||||||
|
|
||||||
|
### Frozen Contract Snapshot
|
||||||
|
1. Canonical command topic: infoscreen/{client_uuid}/commands.
|
||||||
|
2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack.
|
||||||
|
3. Transitional compatibility topics during migration:
|
||||||
|
- infoscreen/{client_uuid}/command
|
||||||
|
- infoscreen/{client_uuid}/command/ack
|
||||||
|
4. Command schema version: 1.0.
|
||||||
|
5. Allowed command actions: reboot_host, shutdown_host.
|
||||||
|
6. Allowed ack status values: accepted, execution_started, completed, failed.
|
||||||
|
7. Validation snippets:
|
||||||
|
- implementation-plans/reboot-command-payload-schemas.md
|
||||||
|
- implementation-plans/reboot-command-payload-schemas.json
|
||||||
|
|
||||||
|
### Immediate Next Steps
|
||||||
|
1. Continue implementation in parallel by team against frozen contract.
|
||||||
|
2. Client team validates dedupe and expiry handling on canonical topics.
|
||||||
|
3. Platform team verifies ack-state transitions for accepted, execution_started, completed, failed.
|
||||||
|
4. Execute one-group canary and validate timing plus failure drills.
|
||||||
27
scripts/infoscreen-cmd-helper.sh
Executable file
27
scripts/infoscreen-cmd-helper.sh
Executable file
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Privileged command helper for remote reboot/shutdown actions.
|
||||||
|
# Intended installation path: /usr/local/bin/infoscreen-cmd-helper.sh
|
||||||
|
# Suggested sudoers entry:
|
||||||
|
# infoscreen ALL=(ALL) NOPASSWD: /usr/local/bin/infoscreen-cmd-helper.sh
|
||||||
|
|
||||||
|
if [[ $# -ne 1 ]]; then
|
||||||
|
echo "usage: infoscreen-cmd-helper.sh <reboot_host|shutdown_host>" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
action="$1"
|
||||||
|
|
||||||
|
case "$action" in
|
||||||
|
reboot_host)
|
||||||
|
exec systemctl reboot
|
||||||
|
;;
|
||||||
|
shutdown_host)
|
||||||
|
exec systemctl poweroff
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unsupported action: $action" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
@@ -3,6 +3,8 @@ Description=Infoscreen Display Manager
|
|||||||
Documentation=https://github.com/RobbStarkAustria/infoscreen_client_2025
|
Documentation=https://github.com/RobbStarkAustria/infoscreen_client_2025
|
||||||
After=network.target graphical.target
|
After=network.target graphical.target
|
||||||
Wants=network-online.target
|
Wants=network-online.target
|
||||||
|
# Publish an MQTT alert if systemd gives up restarting (StartLimitBurst exceeded).
|
||||||
|
OnFailure=infoscreen-notify-failure@%n.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
|
|||||||
55
scripts/infoscreen-notify-failure.sh
Executable file
55
scripts/infoscreen-notify-failure.sh
Executable file
@@ -0,0 +1,55 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Publishes a service-failed MQTT notification when called by systemd OnFailure=.
|
||||||
|
# Usage: infoscreen-notify-failure.sh <failing-unit-name>
|
||||||
|
#
|
||||||
|
# Designed to be called from infoscreen-notify-failure@.service.
|
||||||
|
# Reads broker credentials from .env; reads client UUID from config.
|
||||||
|
# Safe to run even if MQTT is unreachable (exits cleanly, errors logged to journal).
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
FAILING_UNIT="${1:-unknown}"
|
||||||
|
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||||
|
ENV_FILE="$PROJECT_DIR/.env"
|
||||||
|
UUID_FILE="$PROJECT_DIR/src/config/client_uuid.txt"
|
||||||
|
|
||||||
|
# Load .env (skip comments and blank lines)
|
||||||
|
if [[ -f "$ENV_FILE" ]]; then
|
||||||
|
set -a
|
||||||
|
# shellcheck source=/dev/null
|
||||||
|
source <(grep -v '^\s*#' "$ENV_FILE" | grep -v '^\s*$')
|
||||||
|
set +a
|
||||||
|
fi
|
||||||
|
|
||||||
|
MQTT_BROKER="${MQTT_BROKER:-localhost}"
|
||||||
|
MQTT_PORT="${MQTT_PORT:-1883}"
|
||||||
|
MQTT_USER="${MQTT_USER:-}"
|
||||||
|
MQTT_PASSWORD_BROKER="${MQTT_PASSWORD_BROKER:-}"
|
||||||
|
|
||||||
|
CLIENT_UUID="unknown"
|
||||||
|
if [[ -f "$UUID_FILE" ]]; then
|
||||||
|
CLIENT_UUID="$(cat "$UUID_FILE" | tr -d '[:space:]')"
|
||||||
|
fi
|
||||||
|
|
||||||
|
TOPIC="infoscreen/${CLIENT_UUID}/service_failed"
|
||||||
|
TIMESTAMP="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||||
|
|
||||||
|
PAYLOAD=$(printf '{"event":"service_failed","unit":"%s","client_uuid":"%s","failed_at":"%s"}' \
|
||||||
|
"$FAILING_UNIT" "$CLIENT_UUID" "$TIMESTAMP")
|
||||||
|
|
||||||
|
# Build mosquitto_pub auth args
|
||||||
|
AUTH_ARGS=()
|
||||||
|
if [[ -n "$MQTT_USER" ]]; then AUTH_ARGS+=(-u "$MQTT_USER"); fi
|
||||||
|
if [[ -n "$MQTT_PASSWORD_BROKER" ]]; then AUTH_ARGS+=(-P "$MQTT_PASSWORD_BROKER"); fi
|
||||||
|
|
||||||
|
echo "Publishing service-failed notification: unit=$FAILING_UNIT client=$CLIENT_UUID"
|
||||||
|
|
||||||
|
mosquitto_pub \
|
||||||
|
-h "$MQTT_BROKER" \
|
||||||
|
-p "$MQTT_PORT" \
|
||||||
|
"${AUTH_ARGS[@]}" \
|
||||||
|
-t "$TOPIC" \
|
||||||
|
-m "$PAYLOAD" \
|
||||||
|
-q 1 \
|
||||||
|
--retain \
|
||||||
|
2>&1 || echo "WARNING: mosquitto_pub failed (broker unreachable?); notification not delivered"
|
||||||
19
scripts/infoscreen-notify-failure@.service
Normal file
19
scripts/infoscreen-notify-failure@.service
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Infoscreen service-failed MQTT notifier (%i)
|
||||||
|
# One-shot: run once and exit. %i is the failing unit name passed by OnFailure=.
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
User=olafn
|
||||||
|
Group=olafn
|
||||||
|
WorkingDirectory=/home/olafn/infoscreen-dev
|
||||||
|
EnvironmentFile=/home/olafn/infoscreen-dev/.env
|
||||||
|
ExecStart=/home/olafn/infoscreen-dev/scripts/infoscreen-notify-failure.sh %i
|
||||||
|
|
||||||
|
# Do not restart the notifier itself.
|
||||||
|
Restart=no
|
||||||
|
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
SyslogIdentifier=infoscreen-notify-failure
|
||||||
55
scripts/infoscreen-simclient.service
Normal file
55
scripts/infoscreen-simclient.service
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Infoscreen Simclient (MQTT communication)
|
||||||
|
Documentation=https://github.com/RobbStarkAustria/infoscreen_client_2025
|
||||||
|
# Simclient needs network before starting — MQTT will fail otherwise.
|
||||||
|
After=network-online.target
|
||||||
|
Wants=network-online.target
|
||||||
|
# Publish an MQTT alert if systemd gives up restarting (StartLimitBurst exceeded).
|
||||||
|
OnFailure=infoscreen-notify-failure@%n.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
# notify: simclient sends READY=1 via sd_notify once fully initialised.
|
||||||
|
# WatchdogSec: if WATCHDOG=1 is not sent within this window, systemd kills
|
||||||
|
# and restarts the process — detects hung/deadlocked main loops.
|
||||||
|
Type=notify
|
||||||
|
WatchdogSec=60
|
||||||
|
User=olafn
|
||||||
|
Group=olafn
|
||||||
|
WorkingDirectory=/home/olafn/infoscreen-dev
|
||||||
|
|
||||||
|
# Load all client configuration from the local .env file.
|
||||||
|
# Keep .env mode 600; systemd reads it as root before dropping privileges.
|
||||||
|
EnvironmentFile=/home/olafn/infoscreen-dev/.env
|
||||||
|
|
||||||
|
# Start simclient
|
||||||
|
ExecStart=/home/olafn/infoscreen-dev/scripts/start-simclient.sh
|
||||||
|
|
||||||
|
# Restart on failure (non-zero exit or signal).
|
||||||
|
# This covers crash recovery AND the reboot-command lifecycle:
|
||||||
|
# 1. Server sends reboot_host command
|
||||||
|
# 2. Simclient publishes accepted + execution_started, then exits
|
||||||
|
# 3. Systemd restarts simclient within RestartSec seconds
|
||||||
|
# 4. On reconnect, heartbeat loop detects pending_recovery_command and
|
||||||
|
# publishes completed — closing the lifecycle cleanly.
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=10
|
||||||
|
|
||||||
|
# Prevent rapid restart thrash: allow at most 5 restarts in 60 seconds.
|
||||||
|
StartLimitIntervalSec=60
|
||||||
|
StartLimitBurst=5
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
SyslogIdentifier=infoscreen-simclient
|
||||||
|
|
||||||
|
# Security settings
|
||||||
|
NoNewPrivileges=true
|
||||||
|
PrivateTmp=true
|
||||||
|
|
||||||
|
# Resource limits
|
||||||
|
LimitNOFILE=65536
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
# Simclient runs in multi-user mode — no graphical session required.
|
||||||
|
WantedBy=multi-user.target
|
||||||
24
scripts/install-command-helper.sh
Executable file
24
scripts/install-command-helper.sh
Executable file
@@ -0,0 +1,24 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Installs the privileged command helper and sudoers drop-in.
|
||||||
|
# Usage: ./scripts/install-command-helper.sh [linux-user]
|
||||||
|
|
||||||
|
target_user="${1:-$USER}"
|
||||||
|
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
helper_src="$script_dir/infoscreen-cmd-helper.sh"
|
||||||
|
helper_dst="/usr/local/bin/infoscreen-cmd-helper.sh"
|
||||||
|
sudoers_file="/etc/sudoers.d/infoscreen-command-helper"
|
||||||
|
|
||||||
|
if [[ ! -f "$helper_src" ]]; then
|
||||||
|
echo "helper source not found: $helper_src" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
sudo install -m 0755 "$helper_src" "$helper_dst"
|
||||||
|
printf '%s\n' "$target_user ALL=(ALL) NOPASSWD: $helper_dst" | sudo tee "$sudoers_file" >/dev/null
|
||||||
|
sudo chmod 0440 "$sudoers_file"
|
||||||
|
sudo visudo -cf "$sudoers_file" >/dev/null
|
||||||
|
|
||||||
|
echo "Installed helper: $helper_dst"
|
||||||
|
echo "Installed sudoers: $sudoers_file (user: $target_user)"
|
||||||
34
scripts/mock-command-helper.sh
Executable file
34
scripts/mock-command-helper.sh
Executable file
@@ -0,0 +1,34 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Non-destructive helper for command lifecycle canary tests.
|
||||||
|
# Use by starting simclient with:
|
||||||
|
# COMMAND_HELPER_PATH=/home/olafn/infoscreen-dev/scripts/mock-command-helper.sh
|
||||||
|
|
||||||
|
if [[ $# -ne 1 ]]; then
|
||||||
|
echo "usage: mock-command-helper.sh <reboot_host|shutdown_host>" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
action="$1"
|
||||||
|
|
||||||
|
case "$action" in
|
||||||
|
reboot_host|shutdown_host)
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unsupported action: $action" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if [[ "${MOCK_COMMAND_HELPER_FORCE_FAIL:-0}" == "1" ]]; then
|
||||||
|
echo "forced failure for canary test (action=$action)" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "${MOCK_COMMAND_HELPER_SLEEP_SEC:-0}" != "0" ]]; then
|
||||||
|
sleep "${MOCK_COMMAND_HELPER_SLEEP_SEC}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "mock helper executed action=$action"
|
||||||
|
exit 0
|
||||||
35
scripts/start-simclient.sh
Executable file
35
scripts/start-simclient.sh
Executable file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Start Simclient - MQTT communication and event intake for infoscreen
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||||
|
VENV_PATH="$PROJECT_ROOT/venv"
|
||||||
|
SIMCLIENT="$PROJECT_ROOT/src/simclient.py"
|
||||||
|
|
||||||
|
echo "📡 Starting Simclient..."
|
||||||
|
echo "Project root: $PROJECT_ROOT"
|
||||||
|
|
||||||
|
# Check if virtual environment exists
|
||||||
|
if [ ! -d "$VENV_PATH" ]; then
|
||||||
|
echo "❌ Virtual environment not found at: $VENV_PATH"
|
||||||
|
echo "Please create it with: python3 -m venv venv"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Activate virtual environment
|
||||||
|
source "$VENV_PATH/bin/activate"
|
||||||
|
|
||||||
|
# Check if simclient.py exists
|
||||||
|
if [ ! -f "$SIMCLIENT" ]; then
|
||||||
|
echo "❌ Simclient not found at: $SIMCLIENT"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
ENV="${ENV:-development}"
|
||||||
|
echo "Environment: $ENV"
|
||||||
|
|
||||||
|
echo "Starting simclient..."
|
||||||
|
echo "---"
|
||||||
|
exec python3 "$SIMCLIENT"
|
||||||
@@ -1,9 +1,27 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
source "$(dirname "$0")/../.env"
|
source "$(dirname "$0")/../.env"
|
||||||
|
|
||||||
|
MQTT_AUTH_ARGS=()
|
||||||
|
MQTT_TLS_ARGS=()
|
||||||
|
|
||||||
|
if [[ -n "${MQTT_USERNAME:-}" ]]; then
|
||||||
|
MQTT_AUTH_ARGS+=( -u "$MQTT_USERNAME" )
|
||||||
|
fi
|
||||||
|
if [[ -n "${MQTT_PASSWORD:-}" ]]; then
|
||||||
|
MQTT_AUTH_ARGS+=( -P "$MQTT_PASSWORD" )
|
||||||
|
fi
|
||||||
|
if [[ "${MQTT_TLS_ENABLED:-0}" == "1" || "${MQTT_TLS_ENABLED:-0}" == "true" || "${MQTT_TLS_ENABLED:-0}" == "yes" ]]; then
|
||||||
|
[[ -n "${MQTT_TLS_CA_CERT:-}" ]] && MQTT_TLS_ARGS+=( --cafile "$MQTT_TLS_CA_CERT" )
|
||||||
|
[[ -n "${MQTT_TLS_CERT:-}" ]] && MQTT_TLS_ARGS+=( --cert "$MQTT_TLS_CERT" )
|
||||||
|
[[ -n "${MQTT_TLS_KEY:-}" ]] && MQTT_TLS_ARGS+=( --key "$MQTT_TLS_KEY" )
|
||||||
|
if [[ "${MQTT_TLS_INSECURE:-0}" == "1" || "${MQTT_TLS_INSECURE:-0}" == "true" || "${MQTT_TLS_INSECURE:-0}" == "yes" ]]; then
|
||||||
|
MQTT_TLS_ARGS+=( --insecure )
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
echo "Testing MQTT connection to $MQTT_BROKER:$MQTT_PORT"
|
echo "Testing MQTT connection to $MQTT_BROKER:$MQTT_PORT"
|
||||||
echo "Publishing test message..."
|
echo "Publishing test message..."
|
||||||
mosquitto_pub -h "$MQTT_BROKER" -p "$MQTT_PORT" -t "infoscreen/test" -m "Hello from Pi development setup"
|
mosquitto_pub -h "$MQTT_BROKER" -p "$MQTT_PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "infoscreen/test" -m "Hello from Pi development setup"
|
||||||
|
|
||||||
echo "Subscribing to test topic (press Ctrl+C to stop)..."
|
echo "Subscribing to test topic (press Ctrl+C to stop)..."
|
||||||
mosquitto_sub -h "$MQTT_BROKER" -p "$MQTT_PORT" -t "infoscreen/test"
|
mosquitto_sub -h "$MQTT_BROKER" -p "$MQTT_PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "infoscreen/test"
|
||||||
|
|||||||
@@ -36,6 +36,31 @@ fi
|
|||||||
|
|
||||||
BROKER="${MQTT_BROKER:-localhost}"
|
BROKER="${MQTT_BROKER:-localhost}"
|
||||||
PORT="${MQTT_PORT:-1883}"
|
PORT="${MQTT_PORT:-1883}"
|
||||||
|
MQTT_USERNAME="${MQTT_USERNAME:-}"
|
||||||
|
MQTT_PASSWORD="${MQTT_PASSWORD:-}"
|
||||||
|
MQTT_TLS_ENABLED="${MQTT_TLS_ENABLED:-0}"
|
||||||
|
MQTT_TLS_CA_CERT="${MQTT_TLS_CA_CERT:-}"
|
||||||
|
MQTT_TLS_CERT="${MQTT_TLS_CERT:-}"
|
||||||
|
MQTT_TLS_KEY="${MQTT_TLS_KEY:-}"
|
||||||
|
MQTT_TLS_INSECURE="${MQTT_TLS_INSECURE:-0}"
|
||||||
|
|
||||||
|
MQTT_AUTH_ARGS=()
|
||||||
|
MQTT_TLS_ARGS=()
|
||||||
|
|
||||||
|
if [[ -n "$MQTT_USERNAME" ]]; then
|
||||||
|
MQTT_AUTH_ARGS+=( -u "$MQTT_USERNAME" )
|
||||||
|
fi
|
||||||
|
if [[ -n "$MQTT_PASSWORD" ]]; then
|
||||||
|
MQTT_AUTH_ARGS+=( -P "$MQTT_PASSWORD" )
|
||||||
|
fi
|
||||||
|
if [[ "$MQTT_TLS_ENABLED" == "1" || "$MQTT_TLS_ENABLED" == "true" || "$MQTT_TLS_ENABLED" == "yes" ]]; then
|
||||||
|
[[ -n "$MQTT_TLS_CA_CERT" ]] && MQTT_TLS_ARGS+=( --cafile "$MQTT_TLS_CA_CERT" )
|
||||||
|
[[ -n "$MQTT_TLS_CERT" ]] && MQTT_TLS_ARGS+=( --cert "$MQTT_TLS_CERT" )
|
||||||
|
[[ -n "$MQTT_TLS_KEY" ]] && MQTT_TLS_ARGS+=( --key "$MQTT_TLS_KEY" )
|
||||||
|
if [[ "$MQTT_TLS_INSECURE" == "1" || "$MQTT_TLS_INSECURE" == "true" || "$MQTT_TLS_INSECURE" == "yes" ]]; then
|
||||||
|
MQTT_TLS_ARGS+=( --insecure )
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# ── Read runtime IDs ─────────────────────────────────────────────────────────
|
# ── Read runtime IDs ─────────────────────────────────────────────────────────
|
||||||
GROUP_ID_FILE="$PROJECT_ROOT/src/config/last_group_id.txt"
|
GROUP_ID_FILE="$PROJECT_ROOT/src/config/last_group_id.txt"
|
||||||
@@ -107,7 +132,7 @@ EOF
|
|||||||
echo -e "${YELLOW}Publishing to: $topic${NC}"
|
echo -e "${YELLOW}Publishing to: $topic${NC}"
|
||||||
echo "$payload" | python3 -m json.tool 2>/dev/null || echo "$payload"
|
echo "$payload" | python3 -m json.tool 2>/dev/null || echo "$payload"
|
||||||
echo ""
|
echo ""
|
||||||
mosquitto_pub -h "$BROKER" -p "$PORT" -t "$topic" -q 1 --retain -m "$payload"
|
mosquitto_pub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "$topic" -q 1 --retain -m "$payload"
|
||||||
echo -e "${GREEN}Published (retained, QoS 1)${NC}"
|
echo -e "${GREEN}Published (retained, QoS 1)${NC}"
|
||||||
echo "intent_id: $intent_id"
|
echo "intent_id: $intent_id"
|
||||||
}
|
}
|
||||||
@@ -119,7 +144,7 @@ clear_intent() {
|
|||||||
echo -e "${RED}No group_id found.${NC}"
|
echo -e "${RED}No group_id found.${NC}"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
mosquitto_pub -h "$BROKER" -p "$PORT" -t "$topic" -q 1 --retain --null-message
|
mosquitto_pub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "$topic" -q 1 --retain --null-message
|
||||||
echo -e "${GREEN}Retained intent cleared from broker${NC}"
|
echo -e "${GREEN}Retained intent cleared from broker${NC}"
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -167,7 +192,7 @@ subscribe_power_state() {
|
|||||||
echo -e "${YELLOW}Subscribing to: $topic${NC}"
|
echo -e "${YELLOW}Subscribing to: $topic${NC}"
|
||||||
echo "(Ctrl-C to stop)"
|
echo "(Ctrl-C to stop)"
|
||||||
echo ""
|
echo ""
|
||||||
mosquitto_sub -h "$BROKER" -p "$PORT" -t "$topic" | \
|
mosquitto_sub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "$topic" | \
|
||||||
python3 -c "
|
python3 -c "
|
||||||
import sys, json
|
import sys, json
|
||||||
for line in sys.stdin:
|
for line in sys.stdin:
|
||||||
@@ -216,7 +241,7 @@ while true; do
|
|||||||
echo -e "${RED}No group_id.${NC}"
|
echo -e "${RED}No group_id.${NC}"
|
||||||
else
|
else
|
||||||
TOPIC="$(group_topic)"
|
TOPIC="$(group_topic)"
|
||||||
mosquitto_pub -h "$BROKER" -p "$PORT" -t "$TOPIC" -q 1 --retain \
|
mosquitto_pub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -t "$TOPIC" -q 1 --retain \
|
||||||
-m '{"schema_version":"1.0","desired_state":"on"}'
|
-m '{"schema_version":"1.0","desired_state":"on"}'
|
||||||
echo -e "${YELLOW}⚠ Malformed intent published - client must reject with 'missing required field' in log${NC}"
|
echo -e "${YELLOW}⚠ Malformed intent published - client must reject with 'missing required field' in log${NC}"
|
||||||
fi
|
fi
|
||||||
|
|||||||
244
scripts/test-reboot-command.sh
Executable file
244
scripts/test-reboot-command.sh
Executable file
@@ -0,0 +1,244 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Safe end-to-end command lifecycle canary for reboot/shutdown contract v1.
|
||||||
|
# Verifies ack flow: accepted -> execution_started -> completed/failed.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||||
|
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
|
||||||
|
|
||||||
|
BLUE='\033[0;34m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
RED='\033[0;31m'
|
||||||
|
NC='\033[0m'
|
||||||
|
|
||||||
|
ENV_FILE="$PROJECT_ROOT/.env"
|
||||||
|
if [[ -f "$ENV_FILE" ]]; then
|
||||||
|
while IFS='=' read -r key value; do
|
||||||
|
key="${key//[$'\t\r\n']}"
|
||||||
|
key="$(echo "$key" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
|
||||||
|
[[ -z "$key" ]] && continue
|
||||||
|
[[ "$key" =~ ^# ]] && continue
|
||||||
|
[[ "$key" =~ ^[A-Z_][A-Z0-9_]*$ ]] || continue
|
||||||
|
value="${value%%#*}"
|
||||||
|
value="${value//[$'\t\r\n']}"
|
||||||
|
value="$(echo "$value" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
|
||||||
|
export "$key=$value"
|
||||||
|
done < "$ENV_FILE"
|
||||||
|
fi
|
||||||
|
|
||||||
|
BROKER="${MQTT_BROKER:-localhost}"
|
||||||
|
PORT="${MQTT_PORT:-1883}"
|
||||||
|
MQTT_USERNAME="${MQTT_USERNAME:-}"
|
||||||
|
MQTT_PASSWORD="${MQTT_PASSWORD:-}"
|
||||||
|
MQTT_TLS_ENABLED="${MQTT_TLS_ENABLED:-0}"
|
||||||
|
MQTT_TLS_CA_CERT="${MQTT_TLS_CA_CERT:-}"
|
||||||
|
MQTT_TLS_CERT="${MQTT_TLS_CERT:-}"
|
||||||
|
MQTT_TLS_KEY="${MQTT_TLS_KEY:-}"
|
||||||
|
MQTT_TLS_INSECURE="${MQTT_TLS_INSECURE:-0}"
|
||||||
|
CLIENT_UUID_FILE="$PROJECT_ROOT/src/config/client_uuid.txt"
|
||||||
|
LAST_COMMAND_STATE_FILE="$PROJECT_ROOT/src/config/last_command_state.json"
|
||||||
|
|
||||||
|
if [[ ! -f "$CLIENT_UUID_FILE" ]]; then
|
||||||
|
echo -e "${RED}client UUID file missing: $CLIENT_UUID_FILE${NC}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! command -v mosquitto_pub >/dev/null 2>&1 || ! command -v mosquitto_sub >/dev/null 2>&1; then
|
||||||
|
echo -e "${RED}mosquitto_pub/sub not found. Install mosquitto-clients.${NC}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
CLIENT_UUID="$(tr -d '[:space:]' < "$CLIENT_UUID_FILE")"
|
||||||
|
COMMAND_TOPIC="infoscreen/${CLIENT_UUID}/commands"
|
||||||
|
COMMAND_TOPIC_ALIAS="infoscreen/${CLIENT_UUID}/command"
|
||||||
|
ACK_TOPIC="infoscreen/${CLIENT_UUID}/commands/ack"
|
||||||
|
ACK_TOPIC_ALIAS="infoscreen/${CLIENT_UUID}/command/ack"
|
||||||
|
|
||||||
|
MQTT_AUTH_ARGS=()
|
||||||
|
MQTT_TLS_ARGS=()
|
||||||
|
|
||||||
|
if [[ -n "$MQTT_USERNAME" ]]; then
|
||||||
|
MQTT_AUTH_ARGS+=( -u "$MQTT_USERNAME" )
|
||||||
|
fi
|
||||||
|
if [[ -n "$MQTT_PASSWORD" ]]; then
|
||||||
|
MQTT_AUTH_ARGS+=( -P "$MQTT_PASSWORD" )
|
||||||
|
fi
|
||||||
|
if [[ "$MQTT_TLS_ENABLED" == "1" || "$MQTT_TLS_ENABLED" == "true" || "$MQTT_TLS_ENABLED" == "yes" ]]; then
|
||||||
|
[[ -n "$MQTT_TLS_CA_CERT" ]] && MQTT_TLS_ARGS+=( --cafile "$MQTT_TLS_CA_CERT" )
|
||||||
|
[[ -n "$MQTT_TLS_CERT" ]] && MQTT_TLS_ARGS+=( --cert "$MQTT_TLS_CERT" )
|
||||||
|
[[ -n "$MQTT_TLS_KEY" ]] && MQTT_TLS_ARGS+=( --key "$MQTT_TLS_KEY" )
|
||||||
|
if [[ "$MQTT_TLS_INSECURE" == "1" || "$MQTT_TLS_INSECURE" == "true" || "$MQTT_TLS_INSECURE" == "yes" ]]; then
|
||||||
|
MQTT_TLS_ARGS+=( --insecure )
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
ACTION="${1:-reboot_host}"
|
||||||
|
MODE="${2:-success}" # success | failed
|
||||||
|
TOPIC_MODE="${3:-canonical}" # canonical | alias
|
||||||
|
WAIT_SEC="${4:-25}"
|
||||||
|
|
||||||
|
if [[ "$ACTION" != "reboot_host" && "$ACTION" != "shutdown_host" ]]; then
|
||||||
|
echo -e "${RED}invalid action '$ACTION' (expected reboot_host|shutdown_host)${NC}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [[ "$MODE" != "success" && "$MODE" != "failed" ]]; then
|
||||||
|
echo -e "${RED}invalid mode '$MODE' (expected success|failed)${NC}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [[ "$TOPIC_MODE" != "canonical" && "$TOPIC_MODE" != "alias" ]]; then
|
||||||
|
echo -e "${RED}invalid topic mode '$TOPIC_MODE' (expected canonical|alias)${NC}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if ! [[ "$WAIT_SEC" =~ ^[0-9]+$ ]] || [[ "$WAIT_SEC" -lt 1 ]]; then
|
||||||
|
echo -e "${RED}invalid wait seconds '$WAIT_SEC' (expected positive integer)${NC}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$TOPIC_MODE" == "alias" ]]; then
|
||||||
|
COMMAND_TOPIC="$COMMAND_TOPIC_ALIAS"
|
||||||
|
fi
|
||||||
|
|
||||||
|
COMMAND_ID="$(python3 - <<'PY'
|
||||||
|
import uuid
|
||||||
|
print(uuid.uuid4())
|
||||||
|
PY
|
||||||
|
)"
|
||||||
|
ISSUED_AT="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
|
||||||
|
EXPIRES_EPOCH="$(( $(date +%s) + 240 ))"
|
||||||
|
EXPIRES_AT="$(date -u -d "@$EXPIRES_EPOCH" +"%Y-%m-%dT%H:%M:%SZ")"
|
||||||
|
|
||||||
|
PAYLOAD="$(cat <<EOF
|
||||||
|
{
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "$COMMAND_ID",
|
||||||
|
"client_uuid": "$CLIENT_UUID",
|
||||||
|
"action": "$ACTION",
|
||||||
|
"issued_at": "$ISSUED_AT",
|
||||||
|
"expires_at": "$EXPIRES_AT",
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "canary_test"
|
||||||
|
}
|
||||||
|
EOF
|
||||||
|
)"
|
||||||
|
|
||||||
|
TMP_ACK_LOG="$(mktemp)"
|
||||||
|
cleanup() {
|
||||||
|
[[ -n "${SUB_PID_1:-}" ]] && kill "$SUB_PID_1" >/dev/null 2>&1 || true
|
||||||
|
[[ -n "${SUB_PID_2:-}" ]] && kill "$SUB_PID_2" >/dev/null 2>&1 || true
|
||||||
|
rm -f "$TMP_ACK_LOG"
|
||||||
|
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
echo -e "${BLUE}================================================${NC}"
|
||||||
|
echo -e "${BLUE}Command Lifecycle Canary${NC}"
|
||||||
|
echo -e "${BLUE}================================================${NC}"
|
||||||
|
echo " Broker : $BROKER:$PORT"
|
||||||
|
echo " Client UUID : $CLIENT_UUID"
|
||||||
|
echo " Command ID : $COMMAND_ID"
|
||||||
|
echo " Action : $ACTION"
|
||||||
|
echo " Mode : $MODE"
|
||||||
|
echo " Cmd Topic : $COMMAND_TOPIC"
|
||||||
|
echo " Ack Topics : $ACK_TOPIC , $ACK_TOPIC_ALIAS"
|
||||||
|
echo ""
|
||||||
|
echo -e "${YELLOW}IMPORTANT${NC}: to avoid real reboot/shutdown, run simclient with"
|
||||||
|
echo " COMMAND_HELPER_PATH=$PROJECT_ROOT/scripts/mock-command-helper.sh"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Subscribe first to avoid missing retained/non-retained race windows.
|
||||||
|
mosquitto_sub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -q 1 -v -t "$ACK_TOPIC" >> "$TMP_ACK_LOG" &
|
||||||
|
SUB_PID_1=$!
|
||||||
|
mosquitto_sub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -q 1 -v -t "$ACK_TOPIC_ALIAS" >> "$TMP_ACK_LOG" &
|
||||||
|
SUB_PID_2=$!
|
||||||
|
sleep 0.5
|
||||||
|
|
||||||
|
if [[ "$MODE" == "failed" ]]; then
|
||||||
|
echo -e "${YELLOW}If simclient was started with MOCK_COMMAND_HELPER_FORCE_FAIL=1, expected terminal status is failed.${NC}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo -e "${YELLOW}Publishing command payload...${NC}"
|
||||||
|
mosquitto_pub -h "$BROKER" -p "$PORT" "${MQTT_AUTH_ARGS[@]}" "${MQTT_TLS_ARGS[@]}" -q 1 -t "$COMMAND_TOPIC" -m "$PAYLOAD"
|
||||||
|
|
||||||
|
EXPECTED_TERMINAL="completed"
|
||||||
|
if [[ "$MODE" == "failed" ]]; then
|
||||||
|
EXPECTED_TERMINAL="failed"
|
||||||
|
fi
|
||||||
|
|
||||||
|
EXPECT_RECOVERY_COMPLETION=0
|
||||||
|
if [[ "$ACTION" == "reboot_host" && "$MODE" == "success" ]]; then
|
||||||
|
EXPECTED_TERMINAL=""
|
||||||
|
EXPECT_RECOVERY_COMPLETION=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
DEADLINE=$(( $(date +%s) + WAIT_SEC ))
|
||||||
|
SEEN_ACCEPTED=0
|
||||||
|
SEEN_STARTED=0
|
||||||
|
SEEN_TERMINAL=0
|
||||||
|
|
||||||
|
while [[ $(date +%s) -lt $DEADLINE ]]; do
|
||||||
|
if grep -q '"command_id"' "$TMP_ACK_LOG" 2>/dev/null; then
|
||||||
|
if grep -q "\"command_id\": \"$COMMAND_ID\"" "$TMP_ACK_LOG"; then
|
||||||
|
grep -q '"status": "accepted"' "$TMP_ACK_LOG" && SEEN_ACCEPTED=1 || true
|
||||||
|
grep -q '"status": "execution_started"' "$TMP_ACK_LOG" && SEEN_STARTED=1 || true
|
||||||
|
if [[ -n "$EXPECTED_TERMINAL" ]]; then
|
||||||
|
grep -q "\"status\": \"$EXPECTED_TERMINAL\"" "$TMP_ACK_LOG" && SEEN_TERMINAL=1 || true
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $SEEN_ACCEPTED -eq 1 && $SEEN_STARTED -eq 1 ]]; then
|
||||||
|
if [[ -z "$EXPECTED_TERMINAL" || $SEEN_TERMINAL -eq 1 ]]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo -e "${BLUE}Ack stream (filtered by command_id):${NC}"
|
||||||
|
python3 - <<'PY' "$TMP_ACK_LOG" "$COMMAND_ID"
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
path, command_id = sys.argv[1], sys.argv[2]
|
||||||
|
with open(path, "r", encoding="utf-8", errors="ignore") as f:
|
||||||
|
for line in f:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
parts = line.split(" ", 1)
|
||||||
|
payload = parts[1] if len(parts) == 2 else parts[0]
|
||||||
|
try:
|
||||||
|
obj = json.loads(payload)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if obj.get("command_id") == command_id:
|
||||||
|
print(json.dumps(obj, indent=2))
|
||||||
|
PY
|
||||||
|
|
||||||
|
if [[ $SEEN_ACCEPTED -eq 1 && $SEEN_STARTED -eq 1 ]]; then
|
||||||
|
if [[ -z "$EXPECTED_TERMINAL" ]]; then
|
||||||
|
echo -e "${GREEN}PASS${NC}: observed accepted -> execution_started"
|
||||||
|
echo -e "${YELLOW}NOTE${NC}: completed for reboot_host is expected only after client reconnect/recovery."
|
||||||
|
elif [[ $SEEN_TERMINAL -eq 1 ]]; then
|
||||||
|
echo -e "${GREEN}PASS${NC}: observed accepted -> execution_started -> $EXPECTED_TERMINAL"
|
||||||
|
else
|
||||||
|
echo -e "${RED}FAIL${NC}: missing expected terminal state $EXPECTED_TERMINAL for command_id=$COMMAND_ID"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo -e "${RED}FAIL${NC}: missing expected lifecycle states for command_id=$COMMAND_ID"
|
||||||
|
if [[ -n "$EXPECTED_TERMINAL" ]]; then
|
||||||
|
echo " observed: accepted=$SEEN_ACCEPTED execution_started=$SEEN_STARTED terminal($EXPECTED_TERMINAL)=$SEEN_TERMINAL"
|
||||||
|
else
|
||||||
|
echo " observed: accepted=$SEEN_ACCEPTED execution_started=$SEEN_STARTED"
|
||||||
|
fi
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -f "$LAST_COMMAND_STATE_FILE" ]]; then
|
||||||
|
echo ""
|
||||||
|
echo -e "${BLUE}Last command state:${NC}"
|
||||||
|
python3 -m json.tool "$LAST_COMMAND_STATE_FILE" || cat "$LAST_COMMAND_STATE_FILE"
|
||||||
|
fi
|
||||||
@@ -8,6 +8,13 @@ VERSION=latest
|
|||||||
# MQTT Broker
|
# MQTT Broker
|
||||||
MQTT_BROKER=192.168.1.100
|
MQTT_BROKER=192.168.1.100
|
||||||
MQTT_PORT=1883
|
MQTT_PORT=1883
|
||||||
|
MQTT_USERNAME=infoscreen-client-<client-uuid-prefix>
|
||||||
|
MQTT_PASSWORD=<set-per-device-20-char-random-password>
|
||||||
|
MQTT_TLS_ENABLED=0
|
||||||
|
# MQTT_TLS_CA_CERT=/etc/infoscreen/mqtt/ca.crt
|
||||||
|
# MQTT_TLS_CERT=/etc/infoscreen/mqtt/client.crt
|
||||||
|
# MQTT_TLS_KEY=/etc/infoscreen/mqtt/client.key
|
||||||
|
# MQTT_TLS_INSECURE=0
|
||||||
|
|
||||||
# Timing (production values)
|
# Timing (production values)
|
||||||
HEARTBEAT_INTERVAL=60
|
HEARTBEAT_INTERVAL=60
|
||||||
|
|||||||
@@ -9,6 +9,13 @@ LOG_LEVEL=DEBUG
|
|||||||
# MQTT Broker Configuration
|
# MQTT Broker Configuration
|
||||||
MQTT_BROKER=192.168.1.100 # Change to your MQTT server IP
|
MQTT_BROKER=192.168.1.100 # Change to your MQTT server IP
|
||||||
MQTT_PORT=1883
|
MQTT_PORT=1883
|
||||||
|
MQTT_USERNAME=infoscreen-client-<client-uuid-prefix>
|
||||||
|
MQTT_PASSWORD=<set-per-device-20-char-random-password>
|
||||||
|
MQTT_TLS_ENABLED=0
|
||||||
|
# MQTT_TLS_CA_CERT=/etc/infoscreen/mqtt/ca.crt
|
||||||
|
# MQTT_TLS_CERT=/etc/infoscreen/mqtt/client.crt
|
||||||
|
# MQTT_TLS_KEY=/etc/infoscreen/mqtt/client.key
|
||||||
|
# MQTT_TLS_INSECURE=0
|
||||||
|
|
||||||
# Timing Configuration (shorter intervals for development)
|
# Timing Configuration (shorter intervals for development)
|
||||||
HEARTBEAT_INTERVAL=10 # Heartbeat frequency in seconds
|
HEARTBEAT_INTERVAL=10 # Heartbeat frequency in seconds
|
||||||
|
|||||||
@@ -22,23 +22,52 @@ Primary runtime flow:
|
|||||||
## Key Files
|
## Key Files
|
||||||
|
|
||||||
- `display_manager.py`: display lifecycle, HDMI-CEC, screenshots, local fallback logic.
|
- `display_manager.py`: display lifecycle, HDMI-CEC, screenshots, local fallback logic.
|
||||||
- `simclient.py`: MQTT callbacks, event persistence, dashboard publishing, power-intent validation.
|
- `simclient.py`: MQTT callbacks, event persistence, dashboard publishing, power-intent validation, command intake.
|
||||||
- `current_event.json`: active event state consumed by the display manager.
|
- `current_event.json`: active event state consumed by the display manager.
|
||||||
- `current_process_health.json`: local health bridge for monitoring.
|
- `current_process_health.json`: local health bridge for monitoring.
|
||||||
- `power_intent_state.json`: latest validated power intent from MQTT.
|
- `power_intent_state.json`: latest validated power intent from MQTT.
|
||||||
- `power_state.json`: latest applied power action telemetry.
|
- `power_state.json`: latest applied power action telemetry.
|
||||||
- `screenshots/meta.json`: screenshot metadata used by the dashboard path.
|
- `screenshots/meta.json`: screenshot metadata used by the dashboard path.
|
||||||
|
- `../scripts/start-simclient.sh`: launcher for `simclient.py` (used by the systemd unit).
|
||||||
|
- `../scripts/start-display-manager.sh`: launcher for `display_manager.py`.
|
||||||
|
- `../scripts/infoscreen-simclient.service`: systemd unit for `simclient.py`.
|
||||||
|
- `../scripts/infoscreen-display.service`: systemd unit for `display_manager.py`.
|
||||||
|
- `../scripts/infoscreen-notify-failure@.service`: systemd template unit; fires on `OnFailure=`.
|
||||||
|
- `../scripts/infoscreen-notify-failure.sh`: publishes `service_failed` MQTT alert when a unit gives up.
|
||||||
|
|
||||||
## Developer Workflow
|
## Developer Workflow
|
||||||
|
|
||||||
Typical local workflow:
|
On deployed devices, both processes are managed by systemd:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Start / stop / restart
|
||||||
|
sudo systemctl start infoscreen-simclient infoscreen-display
|
||||||
|
sudo systemctl restart infoscreen-simclient infoscreen-display
|
||||||
|
|
||||||
|
# Follow logs
|
||||||
|
journalctl -u infoscreen-simclient -u infoscreen-display -f
|
||||||
|
```
|
||||||
|
|
||||||
|
First-time systemd setup:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo cp scripts/infoscreen-simclient.service /etc/systemd/system/
|
||||||
|
sudo cp scripts/infoscreen-display.service /etc/systemd/system/
|
||||||
|
sudo cp scripts/infoscreen-notify-failure@.service /etc/systemd/system/
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable infoscreen-simclient infoscreen-display
|
||||||
|
```
|
||||||
|
|
||||||
|
Or run `src/pi-setup.sh` which includes the above as step 14.
|
||||||
|
|
||||||
|
For local development without systemd:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd ~/infoscreen-dev
|
cd ~/infoscreen-dev
|
||||||
source venv/bin/activate
|
source venv/bin/activate
|
||||||
|
|
||||||
# Terminal 1
|
# Terminal 1
|
||||||
./scripts/start-dev.sh
|
./scripts/start-simclient.sh
|
||||||
|
|
||||||
# Terminal 2
|
# Terminal 2
|
||||||
./scripts/start-display-manager.sh
|
./scripts/start-display-manager.sh
|
||||||
@@ -51,6 +80,7 @@ Useful helpers:
|
|||||||
- `./scripts/test-mqtt.sh`
|
- `./scripts/test-mqtt.sh`
|
||||||
- `./scripts/test-screenshot.sh`
|
- `./scripts/test-screenshot.sh`
|
||||||
- `./scripts/test-power-intent.sh`
|
- `./scripts/test-power-intent.sh`
|
||||||
|
- `./scripts/test-progress-bars.sh`
|
||||||
|
|
||||||
## MQTT Topics
|
## MQTT Topics
|
||||||
|
|
||||||
@@ -59,8 +89,11 @@ Useful helpers:
|
|||||||
- `infoscreen/discovery`
|
- `infoscreen/discovery`
|
||||||
- `infoscreen/{client_id}/heartbeat`
|
- `infoscreen/{client_id}/heartbeat`
|
||||||
- `infoscreen/{client_id}/dashboard`
|
- `infoscreen/{client_id}/dashboard`
|
||||||
- `infoscreen/{client_id}/health`
|
- `infoscreen/{client_id}/health` — includes `broker_connection` block with `reconnect_count`, `last_disconnect_at`
|
||||||
- `infoscreen/{client_id}/power/state`
|
- `infoscreen/{client_id}/power/state`
|
||||||
|
- `infoscreen/{client_id}/commands/ack` — command acknowledgement (states: `accepted`, `rejected`, `execution_started`, `completed`, `failed`)
|
||||||
|
- `infoscreen/{client_id}/command/ack` — legacy ack topic (also published for compatibility)
|
||||||
|
- `infoscreen/{client_id}/service_failed` — retained alert published by `infoscreen-notify-failure.sh` when systemd gives up restarting a unit
|
||||||
|
|
||||||
### Server → Client
|
### Server → Client
|
||||||
|
|
||||||
@@ -68,6 +101,7 @@ Useful helpers:
|
|||||||
- `infoscreen/{client_id}/group_id`
|
- `infoscreen/{client_id}/group_id`
|
||||||
- `infoscreen/events/{group_id}`
|
- `infoscreen/events/{group_id}`
|
||||||
- `infoscreen/groups/{group_id}/power/intent`
|
- `infoscreen/groups/{group_id}/power/intent`
|
||||||
|
- `infoscreen/{client_id}/commands` — remote command intake (`reboot`, `shutdown`)
|
||||||
|
|
||||||
## Event and Display Notes
|
## Event and Display Notes
|
||||||
|
|
||||||
@@ -118,6 +152,8 @@ cat ~/infoscreen-dev/src/screenshots/meta.json
|
|||||||
|
|
||||||
- `ENV=development` disables HDMI-CEC in the display manager.
|
- `ENV=development` disables HDMI-CEC in the display manager.
|
||||||
- `POWER_CONTROL_MODE` controls local vs hybrid vs mqtt power behavior.
|
- `POWER_CONTROL_MODE` controls local vs hybrid vs mqtt power behavior.
|
||||||
|
- `COMMAND_HELPER_PATH` points to the shell script that executes privileged commands (reboot/shutdown). Use `mock-command-helper.sh` for local testing.
|
||||||
|
- `COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE=1` makes a mock reboot complete immediately instead of waiting for process restart. Only works when the helper basename is `mock-command-helper.sh`.
|
||||||
- File download host rewriting is handled in `simclient.py` using `FILE_SERVER_*` settings.
|
- File download host rewriting is handled in `simclient.py` using `FILE_SERVER_*` settings.
|
||||||
|
|
||||||
## Related Documents
|
## Related Documents
|
||||||
|
|||||||
@@ -125,13 +125,54 @@ EOF
|
|||||||
chmod +x "$PROJECT_DIR/scripts/start-dev.sh"
|
chmod +x "$PROJECT_DIR/scripts/start-dev.sh"
|
||||||
log_ok "start-dev.sh created"
|
log_ok "start-dev.sh created"
|
||||||
|
|
||||||
# 13. SSH enable (for remote dev)
|
# 13. Install command helper + sudoers rule for reboot/shutdown command execution
|
||||||
|
log_step "Installing command helper and sudoers policy..."
|
||||||
|
HELPER_SRC="$PROJECT_DIR/scripts/infoscreen-cmd-helper.sh"
|
||||||
|
HELPER_DST="/usr/local/bin/infoscreen-cmd-helper.sh"
|
||||||
|
SUDOERS_FILE="/etc/sudoers.d/infoscreen-command-helper"
|
||||||
|
|
||||||
|
if [ -f "$HELPER_SRC" ]; then
|
||||||
|
sudo install -m 0755 "$HELPER_SRC" "$HELPER_DST"
|
||||||
|
echo "$USER ALL=(ALL) NOPASSWD: $HELPER_DST" | sudo tee "$SUDOERS_FILE" >/dev/null
|
||||||
|
sudo chmod 0440 "$SUDOERS_FILE"
|
||||||
|
sudo visudo -cf "$SUDOERS_FILE" >/dev/null
|
||||||
|
log_ok "Command helper installed at $HELPER_DST and sudoers rule validated"
|
||||||
|
else
|
||||||
|
log_warn "Command helper source not found: $HELPER_SRC"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 14. Systemd service units (simclient + display manager)
|
||||||
|
log_step "Installing systemd service units..."
|
||||||
|
SERVICES_SRC="$PROJECT_DIR/scripts"
|
||||||
|
SYSTEMD_DIR="/etc/systemd/system"
|
||||||
|
|
||||||
|
for unit in infoscreen-simclient.service infoscreen-display.service "infoscreen-notify-failure@.service"; do
|
||||||
|
if [ -f "$SERVICES_SRC/$unit" ]; then
|
||||||
|
sudo cp "$SERVICES_SRC/$unit" "$SYSTEMD_DIR/$unit"
|
||||||
|
log_ok "Installed $SYSTEMD_DIR/$unit"
|
||||||
|
else
|
||||||
|
log_warn "Service unit not found: $SERVICES_SRC/$unit"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
|
||||||
|
for unit in infoscreen-simclient.service infoscreen-display.service; do
|
||||||
|
if [ -f "$SYSTEMD_DIR/$unit" ]; then
|
||||||
|
sudo systemctl enable "$unit" >/dev/null 2>&1 || true
|
||||||
|
log_ok "Enabled: $unit"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
log_ok "Systemd units installed. Start with: sudo systemctl start infoscreen-simclient infoscreen-display"
|
||||||
|
|
||||||
|
# 15. SSH enable (for remote dev)
|
||||||
log_step "Ensuring SSH service enabled..."
|
log_step "Ensuring SSH service enabled..."
|
||||||
sudo systemctl enable ssh >/dev/null 2>&1 || true
|
sudo systemctl enable ssh >/dev/null 2>&1 || true
|
||||||
sudo systemctl start ssh >/dev/null 2>&1 || true
|
sudo systemctl start ssh >/dev/null 2>&1 || true
|
||||||
log_ok "SSH service active"
|
log_ok "SSH service active"
|
||||||
|
|
||||||
# 14. Summary
|
# 16. Summary
|
||||||
echo ""
|
echo ""
|
||||||
echo -e "${GREEN}🎉 Setup complete!${NC}"
|
echo -e "${GREEN}🎉 Setup complete!${NC}"
|
||||||
echo "Project: $PROJECT_DIR"
|
echo "Project: $PROJECT_DIR"
|
||||||
|
|||||||
636
src/simclient.py
636
src/simclient.py
@@ -7,18 +7,39 @@ import json
|
|||||||
import socket
|
import socket
|
||||||
import hashlib
|
import hashlib
|
||||||
import paho.mqtt.client as mqtt
|
import paho.mqtt.client as mqtt
|
||||||
|
import ssl
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import re
|
import re
|
||||||
import platform
|
import platform
|
||||||
import logging
|
import logging
|
||||||
|
import subprocess
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import requests
|
import requests
|
||||||
import base64
|
import base64
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone, timedelta
|
||||||
import threading
|
import threading
|
||||||
from urllib.parse import urlsplit, urlunsplit, unquote
|
from urllib.parse import urlsplit, urlunsplit, unquote
|
||||||
|
|
||||||
|
|
||||||
|
def _sd_notify(msg: str) -> None:
|
||||||
|
"""Send a sd_notify message to systemd via NOTIFY_SOCKET.
|
||||||
|
|
||||||
|
Uses raw socket so no extra package (systemd-python) is required.
|
||||||
|
Safe to call when not running under systemd (NOTIFY_SOCKET unset).
|
||||||
|
"""
|
||||||
|
sock_path = os.environ.get("NOTIFY_SOCKET")
|
||||||
|
if not sock_path:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
addr = sock_path.lstrip("@") # abstract namespace sockets start with '@'
|
||||||
|
with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as s:
|
||||||
|
s.connect(addr)
|
||||||
|
s.sendall(msg.encode())
|
||||||
|
except Exception:
|
||||||
|
pass # never crash the app over a watchdog notification
|
||||||
|
|
||||||
|
|
||||||
# ENV laden - support both container and native development
|
# ENV laden - support both container and native development
|
||||||
env_paths = [
|
env_paths = [
|
||||||
"/workspace/simclient/.env", # Container path
|
"/workspace/simclient/.env", # Container path
|
||||||
@@ -96,6 +117,18 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG" if ENV == "development" else "INFO")
|
|||||||
MQTT_BROKER = _env_host("MQTT_BROKER", "localhost" if ENV == "development" else "mqtt")
|
MQTT_BROKER = _env_host("MQTT_BROKER", "localhost" if ENV == "development" else "mqtt")
|
||||||
MQTT_PORT = _env_int("MQTT_PORT", 1883)
|
MQTT_PORT = _env_int("MQTT_PORT", 1883)
|
||||||
DEBUG_MODE = _env_bool("DEBUG_MODE", ENV == "development")
|
DEBUG_MODE = _env_bool("DEBUG_MODE", ENV == "development")
|
||||||
|
MQTT_USER = _env_str_clean("MQTT_USER", "")
|
||||||
|
MQTT_PASSWORD_BROKER = _env_str_clean("MQTT_PASSWORD_BROKER", "")
|
||||||
|
MQTT_USERNAME = _env_str_clean("MQTT_USERNAME", "")
|
||||||
|
MQTT_PASSWORD = _env_str_clean("MQTT_PASSWORD", "")
|
||||||
|
MQTT_TLS_CA_CERT = _env_str_clean("MQTT_TLS_CA_CERT", "")
|
||||||
|
MQTT_TLS_CERT = _env_str_clean("MQTT_TLS_CERT", "")
|
||||||
|
MQTT_TLS_KEY = _env_str_clean("MQTT_TLS_KEY", "")
|
||||||
|
MQTT_TLS_INSECURE = _env_bool("MQTT_TLS_INSECURE", False)
|
||||||
|
MQTT_TLS_ENABLED = _env_bool(
|
||||||
|
"MQTT_TLS_ENABLED",
|
||||||
|
bool(MQTT_TLS_CA_CERT or MQTT_TLS_CERT or MQTT_TLS_KEY),
|
||||||
|
)
|
||||||
MQTT_BROKER_FALLBACKS = []
|
MQTT_BROKER_FALLBACKS = []
|
||||||
_fallbacks_raw = os.getenv("MQTT_BROKER_FALLBACKS", "")
|
_fallbacks_raw = os.getenv("MQTT_BROKER_FALLBACKS", "")
|
||||||
if _fallbacks_raw:
|
if _fallbacks_raw:
|
||||||
@@ -150,6 +183,48 @@ SCREENSHOT_META_FILE = os.path.join(SCREENSHOT_DIR, "meta.json")
|
|||||||
POWER_CONTROL_MODE = os.getenv("POWER_CONTROL_MODE", "local").strip().lower()
|
POWER_CONTROL_MODE = os.getenv("POWER_CONTROL_MODE", "local").strip().lower()
|
||||||
POWER_INTENT_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_intent_state.json")
|
POWER_INTENT_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_intent_state.json")
|
||||||
POWER_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_state.json")
|
POWER_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_state.json")
|
||||||
|
COMMAND_STATE_DIR = os.path.join(os.path.dirname(__file__), "config")
|
||||||
|
PROCESSED_COMMANDS_FILE = os.path.join(COMMAND_STATE_DIR, "processed_commands.json")
|
||||||
|
LAST_COMMAND_STATE_FILE = os.path.join(COMMAND_STATE_DIR, "last_command_state.json")
|
||||||
|
COMMAND_HELPER_PATH = os.getenv("COMMAND_HELPER_PATH", "/usr/local/bin/infoscreen-cmd-helper.sh")
|
||||||
|
COMMAND_EXEC_TIMEOUT_SEC = _env_int("COMMAND_EXEC_TIMEOUT_SEC", 15)
|
||||||
|
COMMAND_DEDUPE_TTL_HOURS = _env_int("COMMAND_DEDUPE_TTL_HOURS", 24)
|
||||||
|
COMMAND_DEDUPE_MAX_ENTRIES = _env_int("COMMAND_DEDUPE_MAX_ENTRIES", 5000)
|
||||||
|
COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE = _env_bool("COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE", False)
|
||||||
|
|
||||||
|
NIL_COMMAND_ID = "00000000-0000-0000-0000-000000000000"
|
||||||
|
COMMAND_ACTIONS = ("reboot_host", "shutdown_host")
|
||||||
|
ACK_STATUSES = ("accepted", "execution_started", "completed", "failed")
|
||||||
|
COMMAND_ERROR_CODES = {
|
||||||
|
"invalid_schema",
|
||||||
|
"missing_field",
|
||||||
|
"stale_command",
|
||||||
|
"duplicate_command",
|
||||||
|
"permission_denied_local",
|
||||||
|
"execution_timeout",
|
||||||
|
"execution_failed",
|
||||||
|
"broker_unavailable",
|
||||||
|
"internal_error",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def command_requires_recovery_completion(action):
|
||||||
|
return action == "reboot_host"
|
||||||
|
|
||||||
|
|
||||||
|
def command_mock_reboot_immediate_complete_enabled(action):
|
||||||
|
if action != "reboot_host" or not COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE:
|
||||||
|
return False
|
||||||
|
|
||||||
|
helper_basename = os.path.basename((COMMAND_HELPER_PATH or "").strip())
|
||||||
|
if helper_basename == "mock-command-helper.sh":
|
||||||
|
return True
|
||||||
|
|
||||||
|
logging.warning(
|
||||||
|
"Ignoring COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE because helper is not mock: %s",
|
||||||
|
COMMAND_HELPER_PATH,
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
discovered = False
|
discovered = False
|
||||||
@@ -361,6 +436,246 @@ def write_power_intent_state(data):
|
|||||||
logging.error(f"Error writing power intent state: {e}")
|
logging.error(f"Error writing power intent state: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def _atomic_write_json(path, data):
|
||||||
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||||
|
tmp_path = path + ".tmp"
|
||||||
|
with open(tmp_path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(data, f, ensure_ascii=False, indent=2)
|
||||||
|
os.replace(tmp_path, path)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_json_or_default(path, default):
|
||||||
|
try:
|
||||||
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
|
return json.load(f)
|
||||||
|
except Exception:
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_command_id(payload):
|
||||||
|
if isinstance(payload, dict):
|
||||||
|
value = payload.get("command_id")
|
||||||
|
if isinstance(value, str) and value.strip():
|
||||||
|
return value.strip()
|
||||||
|
return NIL_COMMAND_ID
|
||||||
|
|
||||||
|
|
||||||
|
def _as_uuid_str(value):
|
||||||
|
if not isinstance(value, str) or not value.strip():
|
||||||
|
raise ValueError("must be a non-empty string")
|
||||||
|
return str(uuid.UUID(value.strip()))
|
||||||
|
|
||||||
|
|
||||||
|
def _prune_processed_commands(commands):
|
||||||
|
if not isinstance(commands, dict):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
cutoff = datetime.now(timezone.utc) - timedelta(hours=max(1, COMMAND_DEDUPE_TTL_HOURS))
|
||||||
|
kept = {}
|
||||||
|
sortable = []
|
||||||
|
|
||||||
|
for command_id, entry in commands.items():
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
continue
|
||||||
|
processed_at = entry.get("processed_at")
|
||||||
|
if not processed_at:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
processed_dt = _parse_utc_iso(processed_at)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if processed_dt < cutoff:
|
||||||
|
continue
|
||||||
|
kept[command_id] = entry
|
||||||
|
sortable.append((processed_dt, command_id))
|
||||||
|
|
||||||
|
max_entries = max(1, COMMAND_DEDUPE_MAX_ENTRIES)
|
||||||
|
if len(sortable) > max_entries:
|
||||||
|
sortable.sort(reverse=True)
|
||||||
|
keep_ids = {cid for _, cid in sortable[:max_entries]}
|
||||||
|
kept = {cid: entry for cid, entry in kept.items() if cid in keep_ids}
|
||||||
|
|
||||||
|
return kept
|
||||||
|
|
||||||
|
|
||||||
|
def load_processed_commands():
|
||||||
|
state = _read_json_or_default(PROCESSED_COMMANDS_FILE, {"commands": {}})
|
||||||
|
commands = state.get("commands") if isinstance(state, dict) else {}
|
||||||
|
commands = _prune_processed_commands(commands)
|
||||||
|
_atomic_write_json(PROCESSED_COMMANDS_FILE, {"commands": commands})
|
||||||
|
return commands
|
||||||
|
|
||||||
|
|
||||||
|
def persist_processed_commands(commands):
|
||||||
|
sanitized = _prune_processed_commands(commands)
|
||||||
|
_atomic_write_json(PROCESSED_COMMANDS_FILE, {"commands": sanitized})
|
||||||
|
|
||||||
|
|
||||||
|
def load_last_command_state():
|
||||||
|
state = _read_json_or_default(LAST_COMMAND_STATE_FILE, {})
|
||||||
|
if isinstance(state, dict):
|
||||||
|
return state
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def write_last_command_state(data):
|
||||||
|
_atomic_write_json(LAST_COMMAND_STATE_FILE, data)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_command_payload(payload, expected_client_uuid):
|
||||||
|
if not isinstance(payload, dict):
|
||||||
|
return False, None, "invalid_schema", "payload must be a JSON object"
|
||||||
|
|
||||||
|
required = {
|
||||||
|
"schema_version",
|
||||||
|
"command_id",
|
||||||
|
"client_uuid",
|
||||||
|
"action",
|
||||||
|
"issued_at",
|
||||||
|
"expires_at",
|
||||||
|
"requested_by",
|
||||||
|
"reason",
|
||||||
|
}
|
||||||
|
payload_keys = set(payload.keys())
|
||||||
|
missing = sorted(required - payload_keys)
|
||||||
|
if missing:
|
||||||
|
return False, None, "missing_field", f"missing required fields: {', '.join(missing)}"
|
||||||
|
extras = sorted(payload_keys - required)
|
||||||
|
if extras:
|
||||||
|
return False, None, "invalid_schema", f"unexpected fields: {', '.join(extras)}"
|
||||||
|
|
||||||
|
if payload.get("schema_version") != "1.0":
|
||||||
|
return False, None, "invalid_schema", "schema_version must be 1.0"
|
||||||
|
|
||||||
|
try:
|
||||||
|
command_id = _as_uuid_str(payload.get("command_id"))
|
||||||
|
except Exception:
|
||||||
|
return False, None, "invalid_schema", "command_id must be a valid UUID"
|
||||||
|
|
||||||
|
try:
|
||||||
|
client_uuid = _as_uuid_str(payload.get("client_uuid"))
|
||||||
|
except Exception:
|
||||||
|
return False, None, "invalid_schema", "client_uuid must be a valid UUID"
|
||||||
|
|
||||||
|
try:
|
||||||
|
expected_uuid = _as_uuid_str(expected_client_uuid)
|
||||||
|
except Exception:
|
||||||
|
expected_uuid = str(expected_client_uuid).strip()
|
||||||
|
|
||||||
|
if client_uuid != expected_uuid:
|
||||||
|
return False, None, "invalid_schema", "client_uuid does not match this client"
|
||||||
|
|
||||||
|
action = payload.get("action")
|
||||||
|
if action not in COMMAND_ACTIONS:
|
||||||
|
return False, None, "invalid_schema", f"action must be one of {COMMAND_ACTIONS}"
|
||||||
|
|
||||||
|
try:
|
||||||
|
issued_at = _parse_utc_iso(payload.get("issued_at"))
|
||||||
|
expires_at = _parse_utc_iso(payload.get("expires_at"))
|
||||||
|
except Exception as e:
|
||||||
|
return False, None, "invalid_schema", f"invalid timestamp: {e}"
|
||||||
|
|
||||||
|
if expires_at <= issued_at:
|
||||||
|
return False, None, "invalid_schema", "expires_at must be later than issued_at"
|
||||||
|
if datetime.now(timezone.utc) > expires_at:
|
||||||
|
return False, None, "stale_command", "command expired"
|
||||||
|
|
||||||
|
requested_by = payload.get("requested_by")
|
||||||
|
if requested_by is not None:
|
||||||
|
if not isinstance(requested_by, int) or requested_by < 1:
|
||||||
|
return False, None, "invalid_schema", "requested_by must be integer >= 1 or null"
|
||||||
|
|
||||||
|
reason = payload.get("reason")
|
||||||
|
if reason is not None:
|
||||||
|
if not isinstance(reason, str):
|
||||||
|
return False, None, "invalid_schema", "reason must be string or null"
|
||||||
|
if len(reason) > 2000:
|
||||||
|
return False, None, "invalid_schema", "reason exceeds max length 2000"
|
||||||
|
|
||||||
|
normalized = {
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": command_id,
|
||||||
|
"client_uuid": client_uuid,
|
||||||
|
"action": action,
|
||||||
|
"issued_at": issued_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
"expires_at": expires_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
"requested_by": requested_by,
|
||||||
|
"reason": reason,
|
||||||
|
}
|
||||||
|
return True, normalized, None, None
|
||||||
|
|
||||||
|
|
||||||
|
def publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_uuid,
|
||||||
|
command_id,
|
||||||
|
status,
|
||||||
|
error_code=None,
|
||||||
|
error_message=None,
|
||||||
|
expires_at=None,
|
||||||
|
):
|
||||||
|
if status not in ACK_STATUSES:
|
||||||
|
raise ValueError(f"invalid ack status: {status}")
|
||||||
|
|
||||||
|
if status == "failed":
|
||||||
|
if not isinstance(error_code, str) or not error_code.strip():
|
||||||
|
error_code = "internal_error"
|
||||||
|
if not isinstance(error_message, str) or not error_message.strip():
|
||||||
|
error_message = "failed without diagnostic message"
|
||||||
|
else:
|
||||||
|
error_code = None
|
||||||
|
error_message = None
|
||||||
|
|
||||||
|
if isinstance(error_code, str):
|
||||||
|
error_code = error_code[:128]
|
||||||
|
if isinstance(error_message, str):
|
||||||
|
error_message = error_message[:4000]
|
||||||
|
|
||||||
|
ack_payload = {
|
||||||
|
"command_id": command_id,
|
||||||
|
"status": status,
|
||||||
|
"error_code": error_code,
|
||||||
|
"error_message": error_message,
|
||||||
|
}
|
||||||
|
encoded = json.dumps(ack_payload)
|
||||||
|
|
||||||
|
ack_topics = [
|
||||||
|
f"infoscreen/{client_uuid}/commands/ack",
|
||||||
|
f"infoscreen/{client_uuid}/command/ack",
|
||||||
|
]
|
||||||
|
retry_schedule = [0.5, 1, 2, 4, 5]
|
||||||
|
attempt = 0
|
||||||
|
|
||||||
|
while True:
|
||||||
|
all_ok = True
|
||||||
|
for topic in ack_topics:
|
||||||
|
result = client.publish(topic, encoded, qos=1, retain=False)
|
||||||
|
if result.rc != mqtt.MQTT_ERR_SUCCESS:
|
||||||
|
all_ok = False
|
||||||
|
logging.warning(
|
||||||
|
"Command ack publish failed: topic=%s status=%s rc=%s",
|
||||||
|
topic,
|
||||||
|
status,
|
||||||
|
result.rc,
|
||||||
|
)
|
||||||
|
|
||||||
|
if all_ok:
|
||||||
|
logging.info("Command ack published: command_id=%s status=%s", command_id, status)
|
||||||
|
return True
|
||||||
|
|
||||||
|
if expires_at:
|
||||||
|
try:
|
||||||
|
if datetime.now(timezone.utc) >= _parse_utc_iso(expires_at):
|
||||||
|
logging.warning("Command ack retry stopped at expiry: command_id=%s", command_id)
|
||||||
|
return False
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
delay = retry_schedule[min(attempt, len(retry_schedule) - 1)]
|
||||||
|
attempt += 1
|
||||||
|
time.sleep(delay)
|
||||||
|
|
||||||
|
|
||||||
def on_message(client, userdata, msg, properties=None):
|
def on_message(client, userdata, msg, properties=None):
|
||||||
global discovered
|
global discovered
|
||||||
logging.info(f"Received: {msg.topic} {msg.payload.decode()}")
|
logging.info(f"Received: {msg.topic} {msg.payload.decode()}")
|
||||||
@@ -482,6 +797,71 @@ def get_model():
|
|||||||
SOFTWARE_VERSION = "1.0.0" # Optional: Anpassen bei neuen Releases
|
SOFTWARE_VERSION = "1.0.0" # Optional: Anpassen bei neuen Releases
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_watchdog_enabled():
|
||||||
|
env_flag = os.getenv("WATCHDOG_ENABLED", "").strip().lower()
|
||||||
|
if env_flag in ("1", "true", "yes", "on"):
|
||||||
|
return True
|
||||||
|
if os.path.exists("/dev/watchdog"):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_boot_source():
|
||||||
|
try:
|
||||||
|
with open("/proc/cmdline", "r", encoding="utf-8") as f:
|
||||||
|
cmdline = f.read().strip()
|
||||||
|
for token in cmdline.split():
|
||||||
|
if token.startswith("root="):
|
||||||
|
return token.split("=", 1)[1]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def configure_mqtt_security(client):
|
||||||
|
# Prefer broker-scoped auth vars when present, fallback to legacy vars.
|
||||||
|
auth_username = MQTT_USER or MQTT_USERNAME
|
||||||
|
auth_password = MQTT_PASSWORD_BROKER if MQTT_USER else MQTT_PASSWORD
|
||||||
|
|
||||||
|
configured = {
|
||||||
|
"username": bool(auth_username),
|
||||||
|
"tls": False,
|
||||||
|
"tls_insecure": False,
|
||||||
|
}
|
||||||
|
|
||||||
|
if auth_username:
|
||||||
|
client.username_pw_set(auth_username, auth_password or None)
|
||||||
|
configured["username"] = True
|
||||||
|
logging.info("Configured MQTT username/password authentication")
|
||||||
|
|
||||||
|
if not MQTT_TLS_ENABLED:
|
||||||
|
return configured
|
||||||
|
|
||||||
|
tls_kwargs = {
|
||||||
|
"ca_certs": MQTT_TLS_CA_CERT or None,
|
||||||
|
"certfile": MQTT_TLS_CERT or None,
|
||||||
|
"keyfile": MQTT_TLS_KEY or None,
|
||||||
|
"tls_version": ssl.PROTOCOL_TLS_CLIENT,
|
||||||
|
}
|
||||||
|
client.tls_set(**tls_kwargs)
|
||||||
|
configured["tls"] = True
|
||||||
|
|
||||||
|
if MQTT_TLS_INSECURE:
|
||||||
|
client.tls_insecure_set(True)
|
||||||
|
configured["tls_insecure"] = True
|
||||||
|
logging.warning("MQTT TLS hostname verification disabled via MQTT_TLS_INSECURE")
|
||||||
|
else:
|
||||||
|
client.tls_insecure_set(False)
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
"Configured MQTT TLS: ca=%s client_cert=%s client_key=%s",
|
||||||
|
bool(MQTT_TLS_CA_CERT),
|
||||||
|
bool(MQTT_TLS_CERT),
|
||||||
|
bool(MQTT_TLS_KEY),
|
||||||
|
)
|
||||||
|
return configured
|
||||||
|
|
||||||
|
|
||||||
def send_discovery(client, client_id, hardware_token, ip_addr):
|
def send_discovery(client, client_id, hardware_token, ip_addr):
|
||||||
macs = get_mac_addresses()
|
macs = get_mac_addresses()
|
||||||
discovery_msg = {
|
discovery_msg = {
|
||||||
@@ -494,6 +874,12 @@ def send_discovery(client, client_id, hardware_token, ip_addr):
|
|||||||
"software_version": SOFTWARE_VERSION,
|
"software_version": SOFTWARE_VERSION,
|
||||||
"macs": macs,
|
"macs": macs,
|
||||||
"model": get_model(),
|
"model": get_model(),
|
||||||
|
"capabilities": {
|
||||||
|
"recovery_class": "software_only",
|
||||||
|
"watchdog_enabled": _detect_watchdog_enabled(),
|
||||||
|
"boot_source": _detect_boot_source(),
|
||||||
|
"command_schema_version": "1.0",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
client.publish("infoscreen/discovery", json.dumps(discovery_msg))
|
client.publish("infoscreen/discovery", json.dumps(discovery_msg))
|
||||||
logging.info(f"Discovery message sent: {discovery_msg}")
|
logging.info(f"Discovery message sent: {discovery_msg}")
|
||||||
@@ -766,7 +1152,7 @@ def delete_client_settings():
|
|||||||
logging.error(f"Error deleting client settings: {e}")
|
logging.error(f"Error deleting client settings: {e}")
|
||||||
|
|
||||||
|
|
||||||
def publish_health_message(client, client_id):
|
def publish_health_message(client, client_id, connection_state=None):
|
||||||
"""Publish health status to server via MQTT"""
|
"""Publish health status to server via MQTT"""
|
||||||
try:
|
try:
|
||||||
health = read_health_state()
|
health = read_health_state()
|
||||||
@@ -784,6 +1170,17 @@ def publish_health_message(client, client_id):
|
|||||||
"status": health.get("process_status")
|
"status": health.get("process_status")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if connection_state is not None:
|
||||||
|
last_disc = connection_state.get("last_disconnect")
|
||||||
|
payload["broker_connection"] = {
|
||||||
|
"broker_reachable": bool(connection_state.get("connected")),
|
||||||
|
"reconnect_count": connection_state.get("reconnect_count", 0),
|
||||||
|
"last_disconnect_at": (
|
||||||
|
datetime.fromtimestamp(last_disc, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
if last_disc else None
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
topic = f"infoscreen/{client_id}/health"
|
topic = f"infoscreen/{client_id}/health"
|
||||||
res = client.publish(topic, json.dumps(payload), qos=1)
|
res = client.publish(topic, json.dumps(payload), qos=1)
|
||||||
@@ -1007,6 +1404,10 @@ def main():
|
|||||||
power_intent_topic = None
|
power_intent_topic = None
|
||||||
last_power_intent_id = None
|
last_power_intent_id = None
|
||||||
last_power_issued_at = None
|
last_power_issued_at = None
|
||||||
|
command_topic = f"infoscreen/{client_id}/commands"
|
||||||
|
command_topic_alias = f"infoscreen/{client_id}/command"
|
||||||
|
processed_commands = load_processed_commands()
|
||||||
|
pending_recovery_command = load_last_command_state()
|
||||||
|
|
||||||
# paho-mqtt v2: opt into latest callback API to avoid deprecation warnings.
|
# paho-mqtt v2: opt into latest callback API to avoid deprecation warnings.
|
||||||
client_kwargs = {"protocol": mqtt.MQTTv311}
|
client_kwargs = {"protocol": mqtt.MQTTv311}
|
||||||
@@ -1018,12 +1419,18 @@ def main():
|
|||||||
pass
|
pass
|
||||||
client = mqtt.Client(**client_kwargs)
|
client = mqtt.Client(**client_kwargs)
|
||||||
client.on_message = on_message
|
client.on_message = on_message
|
||||||
|
configure_mqtt_security(client)
|
||||||
|
|
||||||
# Enable automatic reconnection
|
# Enable automatic reconnection
|
||||||
client.reconnect_delay_set(min_delay=1, max_delay=120)
|
client.reconnect_delay_set(min_delay=1, max_delay=120)
|
||||||
|
|
||||||
# Connection state tracking
|
# Connection state tracking
|
||||||
connection_state = {"connected": False, "last_disconnect": None}
|
connection_state = {
|
||||||
|
"connected": False,
|
||||||
|
"last_disconnect": None,
|
||||||
|
"reconnect_count": 0,
|
||||||
|
"connect_count": 0,
|
||||||
|
}
|
||||||
|
|
||||||
# Optional: Enable MQTT debug logging in DEBUG_MODE
|
# Optional: Enable MQTT debug logging in DEBUG_MODE
|
||||||
if DEBUG_MODE:
|
if DEBUG_MODE:
|
||||||
@@ -1090,6 +1497,9 @@ def main():
|
|||||||
if rc == 0:
|
if rc == 0:
|
||||||
connection_state["connected"] = True
|
connection_state["connected"] = True
|
||||||
connection_state["last_disconnect"] = None
|
connection_state["last_disconnect"] = None
|
||||||
|
connection_state["connect_count"] = connection_state.get("connect_count", 0) + 1
|
||||||
|
if connection_state["connect_count"] > 1:
|
||||||
|
connection_state["reconnect_count"] = connection_state.get("reconnect_count", 0) + 1
|
||||||
|
|
||||||
# Check if this is a reconnection
|
# Check if this is a reconnection
|
||||||
# paho-mqtt v2 provides ConnectFlags with attribute 'session_present'
|
# paho-mqtt v2 provides ConnectFlags with attribute 'session_present'
|
||||||
@@ -1121,6 +1531,11 @@ def main():
|
|||||||
group_id_topic = f"infoscreen/{client_id}/group_id"
|
group_id_topic = f"infoscreen/{client_id}/group_id"
|
||||||
client.subscribe(group_id_topic)
|
client.subscribe(group_id_topic)
|
||||||
logging.info(f"Subscribed to: {group_id_topic}")
|
logging.info(f"Subscribed to: {group_id_topic}")
|
||||||
|
|
||||||
|
# Command topics (canonical + transitional)
|
||||||
|
client.subscribe(command_topic, qos=1)
|
||||||
|
client.subscribe(command_topic_alias, qos=1)
|
||||||
|
logging.info(f"Subscribed to command topics: {command_topic}, {command_topic_alias}")
|
||||||
|
|
||||||
# Wenn beim Start eine group_id vorhanden ist, sofort Event-Topic abonnieren
|
# Wenn beim Start eine group_id vorhanden ist, sofort Event-Topic abonnieren
|
||||||
# Reset event_topic so subscribe_event_topic always re-registers with the broker
|
# Reset event_topic so subscribe_event_topic always re-registers with the broker
|
||||||
@@ -1233,6 +1648,189 @@ def main():
|
|||||||
client.message_callback_add(group_id_topic, on_group_id_message)
|
client.message_callback_add(group_id_topic, on_group_id_message)
|
||||||
logging.info(f"Current group_id at start: {current_group_id if current_group_id else 'none'}")
|
logging.info(f"Current group_id at start: {current_group_id if current_group_id else 'none'}")
|
||||||
|
|
||||||
|
def mark_command_processed(command_id, status, error_code=None):
|
||||||
|
processed_commands[command_id] = {
|
||||||
|
"status": status,
|
||||||
|
"error_code": error_code,
|
||||||
|
"processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
}
|
||||||
|
persist_processed_commands(processed_commands)
|
||||||
|
|
||||||
|
def execute_command_action(action):
|
||||||
|
try:
|
||||||
|
proc = subprocess.run(
|
||||||
|
["sudo", COMMAND_HELPER_PATH, action],
|
||||||
|
timeout=max(1, COMMAND_EXEC_TIMEOUT_SEC),
|
||||||
|
check=False,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
return False, "execution_timeout", f"command timed out after {COMMAND_EXEC_TIMEOUT_SEC}s"
|
||||||
|
except PermissionError:
|
||||||
|
return False, "permission_denied_local", "permission denied invoking command helper"
|
||||||
|
except Exception as e:
|
||||||
|
return False, "internal_error", f"internal execution error: {e}"
|
||||||
|
|
||||||
|
if proc.returncode != 0:
|
||||||
|
stderr = (proc.stderr or "").strip()
|
||||||
|
if proc.returncode in (126, 127):
|
||||||
|
return False, "permission_denied_local", stderr or "command helper unavailable"
|
||||||
|
return False, "execution_failed", stderr or f"helper exited with code {proc.returncode}"
|
||||||
|
|
||||||
|
return True, None, None
|
||||||
|
|
||||||
|
def on_command_message(client, userdata, msg, properties=None):
|
||||||
|
payload_text = msg.payload.decode().strip()
|
||||||
|
received_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
expires_at = None
|
||||||
|
|
||||||
|
if not payload_text:
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
NIL_COMMAND_ID,
|
||||||
|
"failed",
|
||||||
|
error_code="invalid_schema",
|
||||||
|
error_message="empty command payload",
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
payload = json.loads(payload_text)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
NIL_COMMAND_ID,
|
||||||
|
"failed",
|
||||||
|
error_code="invalid_schema",
|
||||||
|
error_message=f"invalid JSON: {e}",
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
command_id_hint = _extract_command_id(payload)
|
||||||
|
if isinstance(payload, dict):
|
||||||
|
expires_at = payload.get("expires_at")
|
||||||
|
|
||||||
|
is_valid, normalized, error_code, error_message = validate_command_payload(payload, client_id)
|
||||||
|
if not is_valid:
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
command_id_hint,
|
||||||
|
"failed",
|
||||||
|
error_code=error_code,
|
||||||
|
error_message=error_message,
|
||||||
|
expires_at=expires_at,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
command_id = normalized["command_id"]
|
||||||
|
expires_at = normalized["expires_at"]
|
||||||
|
action = normalized["action"]
|
||||||
|
|
||||||
|
if command_id in processed_commands:
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
command_id,
|
||||||
|
"failed",
|
||||||
|
error_code="duplicate_command",
|
||||||
|
error_message="command_id already processed",
|
||||||
|
expires_at=expires_at,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
command_id,
|
||||||
|
"accepted",
|
||||||
|
expires_at=expires_at,
|
||||||
|
)
|
||||||
|
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
command_id,
|
||||||
|
"execution_started",
|
||||||
|
expires_at=expires_at,
|
||||||
|
)
|
||||||
|
|
||||||
|
write_last_command_state({
|
||||||
|
"command_id": command_id,
|
||||||
|
"action": action,
|
||||||
|
"ack_status": "execution_started",
|
||||||
|
"received_at": received_at,
|
||||||
|
"execution_started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
"expires_at": expires_at,
|
||||||
|
"source_topic": msg.topic,
|
||||||
|
})
|
||||||
|
|
||||||
|
ok, exec_error_code, exec_error_message = execute_command_action(action)
|
||||||
|
if ok:
|
||||||
|
if command_requires_recovery_completion(action):
|
||||||
|
if command_mock_reboot_immediate_complete_enabled(action):
|
||||||
|
logging.info(
|
||||||
|
"Mock reboot immediate completion enabled: command_id=%s action=%s",
|
||||||
|
command_id,
|
||||||
|
action,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logging.info(
|
||||||
|
"Command entered recovery completion path: command_id=%s action=%s",
|
||||||
|
command_id,
|
||||||
|
action,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
"Command continuing to immediate completion path: command_id=%s action=%s",
|
||||||
|
command_id,
|
||||||
|
action,
|
||||||
|
)
|
||||||
|
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
command_id,
|
||||||
|
"completed",
|
||||||
|
expires_at=expires_at,
|
||||||
|
)
|
||||||
|
mark_command_processed(command_id, "completed")
|
||||||
|
write_last_command_state({
|
||||||
|
"command_id": command_id,
|
||||||
|
"action": action,
|
||||||
|
"ack_status": "completed",
|
||||||
|
"completed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
"expires_at": expires_at,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
command_id,
|
||||||
|
"failed",
|
||||||
|
error_code=exec_error_code,
|
||||||
|
error_message=exec_error_message,
|
||||||
|
expires_at=expires_at,
|
||||||
|
)
|
||||||
|
mark_command_processed(command_id, "failed", error_code=exec_error_code)
|
||||||
|
write_last_command_state({
|
||||||
|
"command_id": command_id,
|
||||||
|
"action": action,
|
||||||
|
"ack_status": "failed",
|
||||||
|
"error_code": exec_error_code,
|
||||||
|
"error_message": exec_error_message,
|
||||||
|
"failed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
"expires_at": expires_at,
|
||||||
|
})
|
||||||
|
|
||||||
|
client.message_callback_add(command_topic, on_command_message)
|
||||||
|
client.message_callback_add(command_topic_alias, on_command_message)
|
||||||
|
|
||||||
def on_power_intent_message(client, userdata, msg, properties=None):
|
def on_power_intent_message(client, userdata, msg, properties=None):
|
||||||
nonlocal last_power_intent_id, last_power_issued_at
|
nonlocal last_power_intent_id, last_power_issued_at
|
||||||
|
|
||||||
@@ -1396,7 +1994,8 @@ def main():
|
|||||||
# Heartbeat-Loop with connection state monitoring
|
# Heartbeat-Loop with connection state monitoring
|
||||||
last_heartbeat = 0
|
last_heartbeat = 0
|
||||||
logging.info("Entering heartbeat loop (network loop already running in background thread)")
|
logging.info("Entering heartbeat loop (network loop already running in background thread)")
|
||||||
|
_sd_notify("READY=1") # tell systemd the process is fully initialised
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
@@ -1416,7 +2015,31 @@ def main():
|
|||||||
if result.rc == mqtt.MQTT_ERR_SUCCESS:
|
if result.rc == mqtt.MQTT_ERR_SUCCESS:
|
||||||
logging.info("Heartbeat sent.")
|
logging.info("Heartbeat sent.")
|
||||||
# Also send health and screenshot heartbeats
|
# Also send health and screenshot heartbeats
|
||||||
publish_health_message(client, client_id)
|
publish_health_message(client, client_id, connection_state)
|
||||||
|
if (
|
||||||
|
isinstance(pending_recovery_command, dict)
|
||||||
|
and pending_recovery_command.get("ack_status") == "execution_started"
|
||||||
|
and pending_recovery_command.get("action") == "reboot_host"
|
||||||
|
and pending_recovery_command.get("command_id")
|
||||||
|
):
|
||||||
|
recovered_command_id = pending_recovery_command.get("command_id")
|
||||||
|
recovered_expires = pending_recovery_command.get("expires_at")
|
||||||
|
publish_command_ack(
|
||||||
|
client,
|
||||||
|
client_id,
|
||||||
|
recovered_command_id,
|
||||||
|
"completed",
|
||||||
|
expires_at=recovered_expires,
|
||||||
|
)
|
||||||
|
mark_command_processed(recovered_command_id, "completed")
|
||||||
|
write_last_command_state({
|
||||||
|
"command_id": recovered_command_id,
|
||||||
|
"action": pending_recovery_command.get("action"),
|
||||||
|
"ack_status": "recovered",
|
||||||
|
"recovered_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
"expires_at": recovered_expires,
|
||||||
|
})
|
||||||
|
pending_recovery_command = None
|
||||||
elif result.rc == mqtt.MQTT_ERR_NO_CONN:
|
elif result.rc == mqtt.MQTT_ERR_NO_CONN:
|
||||||
logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...")
|
logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...")
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
@@ -1424,7 +2047,7 @@ def main():
|
|||||||
retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
|
retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
|
||||||
if retry.rc == mqtt.MQTT_ERR_SUCCESS:
|
if retry.rc == mqtt.MQTT_ERR_SUCCESS:
|
||||||
logging.info("Heartbeat sent after retry.")
|
logging.info("Heartbeat sent after retry.")
|
||||||
publish_health_message(client, client_id)
|
publish_health_message(client, client_id, connection_state)
|
||||||
else:
|
else:
|
||||||
logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}")
|
logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}")
|
||||||
else:
|
else:
|
||||||
@@ -1435,6 +2058,7 @@ def main():
|
|||||||
logging.debug("Skipping heartbeat - MQTT not connected (is_connected=False)")
|
logging.debug("Skipping heartbeat - MQTT not connected (is_connected=False)")
|
||||||
last_heartbeat = current_time
|
last_heartbeat = current_time
|
||||||
|
|
||||||
|
_sd_notify("WATCHDOG=1") # kick systemd watchdog each loop iteration
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
logging.info("Shutting down gracefully...")
|
logging.info("Shutting down gracefully...")
|
||||||
|
|||||||
287
tests/test_command_intake.py
Normal file
287
tests/test_command_intake.py
Normal file
@@ -0,0 +1,287 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for reboot/shutdown command intake primitives.
|
||||||
|
|
||||||
|
Run from project root (venv activated):
|
||||||
|
python -m pytest tests/test_command_intake.py -v
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
||||||
|
|
||||||
|
from simclient import ( # noqa: E402
|
||||||
|
NIL_COMMAND_ID,
|
||||||
|
command_requires_recovery_completion,
|
||||||
|
command_mock_reboot_immediate_complete_enabled,
|
||||||
|
configure_mqtt_security,
|
||||||
|
mqtt,
|
||||||
|
validate_command_payload,
|
||||||
|
publish_command_ack,
|
||||||
|
_prune_processed_commands,
|
||||||
|
load_processed_commands,
|
||||||
|
persist_processed_commands,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class FakePublishResult:
|
||||||
|
def __init__(self, rc):
|
||||||
|
self.rc = rc
|
||||||
|
|
||||||
|
|
||||||
|
class FakeMqttClient:
|
||||||
|
def __init__(self, rc=0):
|
||||||
|
self.rc = rc
|
||||||
|
self.calls = []
|
||||||
|
|
||||||
|
def publish(self, topic, payload, qos=0, retain=False):
|
||||||
|
self.calls.append({
|
||||||
|
"topic": topic,
|
||||||
|
"payload": payload,
|
||||||
|
"qos": qos,
|
||||||
|
"retain": retain,
|
||||||
|
})
|
||||||
|
return FakePublishResult(self.rc)
|
||||||
|
|
||||||
|
|
||||||
|
class SequencedMqttClient:
|
||||||
|
def __init__(self, rc_sequence):
|
||||||
|
self._rc_sequence = list(rc_sequence)
|
||||||
|
self.calls = []
|
||||||
|
|
||||||
|
def publish(self, topic, payload, qos=0, retain=False):
|
||||||
|
rc = self._rc_sequence.pop(0) if self._rc_sequence else 0
|
||||||
|
self.calls.append({
|
||||||
|
"topic": topic,
|
||||||
|
"payload": payload,
|
||||||
|
"qos": qos,
|
||||||
|
"retain": retain,
|
||||||
|
"rc": rc,
|
||||||
|
})
|
||||||
|
return FakePublishResult(rc)
|
||||||
|
|
||||||
|
|
||||||
|
class FakeSecurityClient:
|
||||||
|
def __init__(self):
|
||||||
|
self.username = None
|
||||||
|
self.password = None
|
||||||
|
self.tls_kwargs = None
|
||||||
|
self.tls_insecure = None
|
||||||
|
|
||||||
|
def username_pw_set(self, username, password=None):
|
||||||
|
self.username = username
|
||||||
|
self.password = password
|
||||||
|
|
||||||
|
def tls_set(self, **kwargs):
|
||||||
|
self.tls_kwargs = kwargs
|
||||||
|
|
||||||
|
def tls_insecure_set(self, enabled):
|
||||||
|
self.tls_insecure = enabled
|
||||||
|
|
||||||
|
|
||||||
|
def _valid_payload(seconds_valid=240):
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
exp = now + timedelta(seconds=seconds_valid)
|
||||||
|
return {
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"action": "reboot_host",
|
||||||
|
"issued_at": now.strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
"expires_at": exp.strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "operator_request",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateCommandPayload(unittest.TestCase):
|
||||||
|
def test_accepts_valid_payload(self):
|
||||||
|
payload = _valid_payload()
|
||||||
|
ok, normalized, code, msg = validate_command_payload(payload, payload["client_uuid"])
|
||||||
|
self.assertTrue(ok)
|
||||||
|
self.assertIsNone(code)
|
||||||
|
self.assertIsNone(msg)
|
||||||
|
self.assertEqual(normalized["action"], "reboot_host")
|
||||||
|
|
||||||
|
def test_rejects_extra_fields(self):
|
||||||
|
payload = _valid_payload()
|
||||||
|
payload["extra"] = "x"
|
||||||
|
ok, _, code, msg = validate_command_payload(payload, payload["client_uuid"])
|
||||||
|
self.assertFalse(ok)
|
||||||
|
self.assertEqual(code, "invalid_schema")
|
||||||
|
self.assertIn("unexpected fields", msg)
|
||||||
|
|
||||||
|
def test_rejects_stale_command(self):
|
||||||
|
payload = _valid_payload()
|
||||||
|
old_issued = datetime.now(timezone.utc) - timedelta(hours=3)
|
||||||
|
old_expires = datetime.now(timezone.utc) - timedelta(hours=2)
|
||||||
|
payload["issued_at"] = old_issued.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
payload["expires_at"] = old_expires.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
ok, _, code, _ = validate_command_payload(payload, payload["client_uuid"])
|
||||||
|
self.assertFalse(ok)
|
||||||
|
self.assertEqual(code, "stale_command")
|
||||||
|
|
||||||
|
def test_rejects_action_outside_enum(self):
|
||||||
|
payload = _valid_payload()
|
||||||
|
payload["action"] = "restart_service"
|
||||||
|
ok, _, code, msg = validate_command_payload(payload, payload["client_uuid"])
|
||||||
|
self.assertFalse(ok)
|
||||||
|
self.assertEqual(code, "invalid_schema")
|
||||||
|
self.assertIn("action must be one of", msg)
|
||||||
|
|
||||||
|
def test_rejects_client_uuid_mismatch(self):
|
||||||
|
payload = _valid_payload()
|
||||||
|
ok, _, code, msg = validate_command_payload(
|
||||||
|
payload,
|
||||||
|
"aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",
|
||||||
|
)
|
||||||
|
self.assertFalse(ok)
|
||||||
|
self.assertEqual(code, "invalid_schema")
|
||||||
|
self.assertIn("client_uuid", msg)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCommandLifecyclePolicy(unittest.TestCase):
|
||||||
|
def test_reboot_requires_recovery_completion(self):
|
||||||
|
self.assertTrue(command_requires_recovery_completion("reboot_host"))
|
||||||
|
self.assertFalse(command_requires_recovery_completion("shutdown_host"))
|
||||||
|
|
||||||
|
def test_mock_reboot_immediate_completion_enabled_for_mock_helper(self):
|
||||||
|
with patch("simclient.COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE", True), \
|
||||||
|
patch("simclient.COMMAND_HELPER_PATH", "/home/pi/scripts/mock-command-helper.sh"):
|
||||||
|
self.assertTrue(command_mock_reboot_immediate_complete_enabled("reboot_host"))
|
||||||
|
|
||||||
|
def test_mock_reboot_immediate_completion_disabled_for_live_helper(self):
|
||||||
|
with patch("simclient.COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE", True), \
|
||||||
|
patch("simclient.COMMAND_HELPER_PATH", "/usr/local/bin/infoscreen-cmd-helper.sh"):
|
||||||
|
self.assertFalse(command_mock_reboot_immediate_complete_enabled("reboot_host"))
|
||||||
|
|
||||||
|
|
||||||
|
class TestMqttSecurityConfiguration(unittest.TestCase):
|
||||||
|
def test_configure_username_password(self):
|
||||||
|
fake_client = FakeSecurityClient()
|
||||||
|
with patch("simclient.MQTT_USER", ""), \
|
||||||
|
patch("simclient.MQTT_PASSWORD_BROKER", ""), \
|
||||||
|
patch("simclient.MQTT_USERNAME", "client-user"), \
|
||||||
|
patch("simclient.MQTT_PASSWORD", "client-pass"), \
|
||||||
|
patch("simclient.MQTT_TLS_ENABLED", False):
|
||||||
|
configured = configure_mqtt_security(fake_client)
|
||||||
|
|
||||||
|
self.assertEqual(fake_client.username, "client-user")
|
||||||
|
self.assertEqual(fake_client.password, "client-pass")
|
||||||
|
self.assertFalse(configured["tls"])
|
||||||
|
|
||||||
|
def test_configure_tls(self):
|
||||||
|
fake_client = FakeSecurityClient()
|
||||||
|
with patch("simclient.MQTT_USER", ""), \
|
||||||
|
patch("simclient.MQTT_PASSWORD_BROKER", ""), \
|
||||||
|
patch("simclient.MQTT_USERNAME", ""), \
|
||||||
|
patch("simclient.MQTT_PASSWORD", ""), \
|
||||||
|
patch("simclient.MQTT_TLS_ENABLED", True), \
|
||||||
|
patch("simclient.MQTT_TLS_CA_CERT", "/tmp/ca.pem"), \
|
||||||
|
patch("simclient.MQTT_TLS_CERT", "/tmp/client.pem"), \
|
||||||
|
patch("simclient.MQTT_TLS_KEY", "/tmp/client.key"), \
|
||||||
|
patch("simclient.MQTT_TLS_INSECURE", True):
|
||||||
|
configured = configure_mqtt_security(fake_client)
|
||||||
|
|
||||||
|
self.assertTrue(configured["tls"])
|
||||||
|
self.assertEqual(fake_client.tls_kwargs["ca_certs"], "/tmp/ca.pem")
|
||||||
|
self.assertEqual(fake_client.tls_kwargs["certfile"], "/tmp/client.pem")
|
||||||
|
self.assertEqual(fake_client.tls_kwargs["keyfile"], "/tmp/client.key")
|
||||||
|
self.assertTrue(fake_client.tls_insecure)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAckPublish(unittest.TestCase):
|
||||||
|
def test_failed_ack_forces_non_null_error_fields(self):
|
||||||
|
fake_client = FakeMqttClient(rc=0)
|
||||||
|
ok = publish_command_ack(
|
||||||
|
fake_client,
|
||||||
|
"9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
NIL_COMMAND_ID,
|
||||||
|
"failed",
|
||||||
|
error_code=None,
|
||||||
|
error_message=None,
|
||||||
|
)
|
||||||
|
self.assertTrue(ok)
|
||||||
|
self.assertEqual(len(fake_client.calls), 2)
|
||||||
|
payload = json.loads(fake_client.calls[0]["payload"])
|
||||||
|
self.assertEqual(payload["status"], "failed")
|
||||||
|
self.assertTrue(isinstance(payload["error_code"], str) and payload["error_code"])
|
||||||
|
self.assertTrue(isinstance(payload["error_message"], str) and payload["error_message"])
|
||||||
|
|
||||||
|
def test_retry_on_broker_disconnect_then_success(self):
|
||||||
|
# First loop (2 topics): NO_CONN, NO_CONN. Second loop: success, success.
|
||||||
|
fake_client = SequencedMqttClient([
|
||||||
|
mqtt.MQTT_ERR_NO_CONN,
|
||||||
|
mqtt.MQTT_ERR_NO_CONN,
|
||||||
|
mqtt.MQTT_ERR_SUCCESS,
|
||||||
|
mqtt.MQTT_ERR_SUCCESS,
|
||||||
|
])
|
||||||
|
future_expiry = (datetime.now(timezone.utc) + timedelta(seconds=30)).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
|
with patch("simclient.time.sleep", return_value=None) as sleep_mock:
|
||||||
|
ok = publish_command_ack(
|
||||||
|
fake_client,
|
||||||
|
"9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"accepted",
|
||||||
|
expires_at=future_expiry,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertTrue(ok)
|
||||||
|
self.assertEqual(len(fake_client.calls), 4)
|
||||||
|
sleep_mock.assert_called_once()
|
||||||
|
|
||||||
|
def test_stop_retry_when_expired(self):
|
||||||
|
fake_client = SequencedMqttClient([
|
||||||
|
mqtt.MQTT_ERR_NO_CONN,
|
||||||
|
mqtt.MQTT_ERR_NO_CONN,
|
||||||
|
])
|
||||||
|
past_expiry = (datetime.now(timezone.utc) - timedelta(seconds=30)).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
|
with patch("simclient.time.sleep", return_value=None) as sleep_mock:
|
||||||
|
ok = publish_command_ack(
|
||||||
|
fake_client,
|
||||||
|
"9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"accepted",
|
||||||
|
expires_at=past_expiry,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertFalse(ok)
|
||||||
|
self.assertEqual(len(fake_client.calls), 2)
|
||||||
|
sleep_mock.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
class TestProcessedCommandsState(unittest.TestCase):
|
||||||
|
def test_prune_keeps_recent_only(self):
|
||||||
|
recent = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
old = (datetime.now(timezone.utc) - timedelta(hours=30)).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
commands = {
|
||||||
|
"a": {"processed_at": recent, "status": "completed"},
|
||||||
|
"b": {"processed_at": old, "status": "completed"},
|
||||||
|
}
|
||||||
|
pruned = _prune_processed_commands(commands)
|
||||||
|
self.assertIn("a", pruned)
|
||||||
|
self.assertNotIn("b", pruned)
|
||||||
|
|
||||||
|
def test_load_and_persist_round_trip(self):
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
state_file = os.path.join(tmpdir, "processed_commands.json")
|
||||||
|
with patch("simclient.PROCESSED_COMMANDS_FILE", state_file):
|
||||||
|
persist_processed_commands({
|
||||||
|
"x": {
|
||||||
|
"status": "completed",
|
||||||
|
"processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
loaded = load_processed_commands()
|
||||||
|
self.assertIn("x", loaded)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Reference in New Issue
Block a user