diff --git a/.env.example b/.env.example index 5aee511..a605145 100644 --- a/.env.example +++ b/.env.example @@ -20,8 +20,18 @@ DB_HOST=db # MQTT MQTT_BROKER_HOST=mqtt MQTT_BROKER_PORT=1883 -# MQTT_USER=your_mqtt_user -# MQTT_PASSWORD=your_mqtt_password +# Required for authenticated broker access +MQTT_USER=your_mqtt_user +MQTT_PASSWORD=replace_with_a_32plus_char_random_password +# Optional: dedicated canary client account +MQTT_CANARY_USER=your_canary_mqtt_user +MQTT_CANARY_PASSWORD=replace_with_a_different_32plus_char_random_password +# Optional TLS settings +MQTT_TLS_ENABLED=false +MQTT_TLS_CA_CERT= +MQTT_TLS_CERTFILE= +MQTT_TLS_KEYFILE= +MQTT_TLS_INSECURE=false MQTT_KEEPALIVE=60 # Dashboard @@ -39,6 +49,12 @@ HEARTBEAT_GRACE_PERIOD_PROD=170 # Optional: force periodic republish even without changes # REFRESH_SECONDS=0 +# Crash recovery (scheduler auto-recovery) +# CRASH_RECOVERY_ENABLED=false +# CRASH_RECOVERY_GRACE_SECONDS=180 +# CRASH_RECOVERY_LOCKOUT_MINUTES=15 +# CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS=240 + # Default superadmin bootstrap (server/init_defaults.py) # REQUIRED: Must be set for superadmin creation DEFAULT_SUPERADMIN_USERNAME=superadmin diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2e85676..4c31ba0 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -13,15 +13,16 @@ It is not a changelog and not a full architecture handbook. - Keep changes minimal, match existing patterns, and update docs in the same commit when behavior changes. 
## Fast file map -- `scheduler/scheduler.py` - scheduler loop, MQTT event publishing, TV power intent publishing -- `scheduler/db_utils.py` - event formatting and power-intent helper logic -- `listener/listener.py` - discovery/heartbeat/log/screenshot MQTT consumption +- `scheduler/scheduler.py` - scheduler loop, MQTT event publishing, TV power intent publishing, crash auto-recovery, command expiry sweep +- `scheduler/db_utils.py` - event formatting, power-intent helpers, crash recovery helpers, command expiry sweep +- `listener/listener.py` - discovery/heartbeat/log/screenshot/service_failed MQTT consumption - `server/init_academic_periods.py` - idempotent academic-period seeding + auto-activation for current date - `server/initialize_database.py` - migration + bootstrap orchestration for local/manual setup - `server/routes/events.py` - event CRUD, recurrence handling, UTC normalization - `server/routes/eventmedia.py` - file manager, media upload/stream endpoints - `server/routes/groups.py` - group lifecycle, alive status, order persistence - `server/routes/system_settings.py` - system settings CRUD and supplement-table endpoint +- `server/routes/clients.py` - client metadata, restart/shutdown/restart_app command issuing, command status, crashed/service_failed alert endpoints - `dashboard/src/settings.tsx` - settings UX and system-defaults integration - `dashboard/src/components/CustomEventModal.tsx` - event creation/editing UX - `dashboard/src/monitoring.tsx` - superadmin monitoring page @@ -54,6 +55,9 @@ It is not a changelog and not a full architecture handbook. 
- Logs topic family: `infoscreen/{uuid}/logs/{error|warn|info}` - Health topic: `infoscreen/{uuid}/health` - Dashboard screenshot topic: `infoscreen/{uuid}/dashboard` +- Client command topic (QoS1, non-retained): `infoscreen/{uuid}/commands` (compat alias: `infoscreen/{uuid}/command`) +- Client command ack topic (QoS1, non-retained): `infoscreen/{uuid}/commands/ack` (compat alias: `infoscreen/{uuid}/command/ack`) +- Service-failed topic (retained, client→server): `infoscreen/{uuid}/service_failed` - TV power intent Phase 1 topic (retained, QoS1): `infoscreen/groups/{group_id}/power/intent` TV power intent Phase 1 rules: @@ -82,7 +86,9 @@ TV power intent Phase 1 rules: - Scheduler: `POLL_INTERVAL_SECONDS`, `REFRESH_SECONDS` - Power intent: `POWER_INTENT_PUBLISH_ENABLED`, `POWER_INTENT_HEARTBEAT_ENABLED`, `POWER_INTENT_EXPIRY_MULTIPLIER`, `POWER_INTENT_MIN_EXPIRY_SECONDS` - Monitoring: `PRIORITY_SCREENSHOT_TTL_SECONDS` +- Crash recovery: `CRASH_RECOVERY_ENABLED`, `CRASH_RECOVERY_GRACE_SECONDS`, `CRASH_RECOVERY_LOCKOUT_MINUTES`, `CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS` - Core: `DB_CONN`, `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_NAME`, `ENV` +- MQTT auth/connectivity: `MQTT_BROKER_HOST`, `MQTT_BROKER_PORT`, `MQTT_USER`, `MQTT_PASSWORD` (listener/scheduler/server should use authenticated broker access) ## Edit guardrails - Do not edit generated assets in `dashboard/dist/`. diff --git a/DEV-CHANGELOG.md b/DEV-CHANGELOG.md index 9580c37..245806e 100644 --- a/DEV-CHANGELOG.md +++ b/DEV-CHANGELOG.md @@ -5,6 +5,31 @@ This changelog tracks all changes made in the development workspace, including i --- ## Unreleased (development workspace) +- Crash detection API: Added `GET /api/clients/crashed` returning clients with `process_status=crashed` or stale heartbeat; includes `crash_reason` field (`process_crashed` | `heartbeat_stale`). 
+- Crash auto-recovery (scheduler): Feature-flagged loop (`CRASH_RECOVERY_ENABLED`) scans crash candidates, issues `reboot_host` command, publishes to primary + compat MQTT topics; lockout window and expiry configurable via env. +- Command expiry sweep (scheduler): Unconditional per-cycle sweep in `sweep_expired_commands()` marks non-terminal `ClientCommand` rows past `expires_at` as `expired`. +- `restart_app` action registered in `server/routes/clients.py` API action map; sends same command lifecycle as `reboot_host`; safety lockout covers both actions. +- `service_failed` listener: subscribes to `infoscreen/+/service_failed` on every connect; persists `service_failed_at` + `service_failed_unit` to `Client`; empty payload (retain clear) silently ignored. +- Broker connection health: Listener health handler now extracts `broker_connection.reconnect_count` + `broker_connection.last_disconnect_at` and persists to `Client`. +- DB migration `b1c2d3e4f5a6`: adds `service_failed_at`, `service_failed_unit`, `mqtt_reconnect_count`, `mqtt_last_disconnect_at` to `clients` table. +- Model update: `models/models.py` Client class updated with all four new columns. +- `GET /api/clients/service_failed`: lists clients with `service_failed_at` set, admin-or-higher gated. +- `POST /api/clients//clear_service_failed`: clears DB flag and publishes empty retained MQTT to `infoscreen/{uuid}/service_failed`. +- Monitoring overview includes `mqtt_reconnect_count` + `mqtt_last_disconnect_at` per client. +- Frontend monitoring: orange service-failed alert panel (hidden when count=0), auto-refresh 15s, per-row Quittieren action. +- Frontend monitoring: client detail now shows MQTT reconnect count + last disconnect timestamp. +- Frontend types: `ServiceFailedClient`, `ServiceFailedClientsResponse`; helpers `fetchServiceFailedClients()`, `clearServiceFailed()` added to `dashboard/src/apiClients.ts`. +- `MQTT_EVENT_PAYLOAD_GUIDE.md`: added `service_failed` topic contract. 
+- MQTT auth hardening: Listener and scheduler now connect to broker with env-configured credentials (`MQTT_BROKER_HOST`, `MQTT_BROKER_PORT`, `MQTT_USER`, `MQTT_PASSWORD`) instead of anonymous fixed host/port defaults; optional TLS env toggles added in code path (`MQTT_TLS_*`). +- Broker auth enforcement: `mosquitto/config/mosquitto.conf` now disables anonymous access and enables password-file authentication. `docker-compose.yml` MQTT service now bootstraps/update password entries from env (`MQTT_USER`/`MQTT_PASSWORD`, optional canary user) before starting broker. +- Compose wiring: Added MQTT credential env propagation for listener/scheduler in both base and dev override compose files and switched MQTT healthcheck publish to authenticated mode. +- Backend implementation: Introduced client command lifecycle foundation for remote control in `server/routes/clients.py` with command persistence (`ClientCommand`), schema-based MQTT publish to `infoscreen/{uuid}/commands` (QoS1, non-retained), new endpoints `POST /api/clients//shutdown` and `GET /api/clients/commands/`, and restart safety lockout (`blocked_safety` after 3 restarts in 15 minutes). Added migration `server/alembic/versions/aa12bb34cc56_add_client_commands_table.py` and model updates in `models/models.py`. Restart path keeps transitional legacy MQTT publish to `clients/{uuid}/restart` for compatibility. +- Listener integration: `listener/listener.py` now subscribes to `infoscreen/+/commands/ack` and updates command lifecycle states from client ACK payloads (`accepted`, `execution_started`, `completed`, `failed`). +- Frontend API client prep: Extended `dashboard/src/apiClients.ts` with `ClientCommand` typing and helper calls for lifecycle consumption (`shutdownClient`, `fetchClientCommandStatus`), and updated `restartClient` to accept optional reason payload. 
+- Contract freeze clarification: implementation-plan docs now explicitly freeze canonical MQTT topics (`infoscreen/{uuid}/commands`, `infoscreen/{uuid}/commands/ack`) and JSON schemas with examples; added transitional singular-topic compatibility aliases (`infoscreen/{uuid}/command`, `infoscreen/{uuid}/command/ack`) in server publish and listener ingest. +- Action value canonicalization: command payload actions are now frozen as host-level values (`reboot_host`, `shutdown_host`). API endpoint mapping is explicit (`/restart` -> `reboot_host`, `/shutdown` -> `shutdown_host`), and docs/examples were updated to remove `restart` payload ambiguity. +- Client helper snippets: Added frozen payload validation artifacts `implementation-plans/reboot-command-payload-schemas.md` and `implementation-plans/reboot-command-payload-schemas.json` (copy-ready snippets plus machine-validated JSON Schema). +- Documentation alignment: Added active reboot implementation handoff docs under `implementation-plans/` and linked them in `README.md` for immediate cross-team access (`reboot-implementation-handoff-share.md`, `reboot-implementation-handoff-client-team.md`, `reboot-kickoff-summary.md`). - Programminfo GUI regression/fix: `dashboard/public/program-info.json` could not be loaded in Programminfo menu due to invalid JSON in the new alpha.16 changelog line (malformed quote in a text entry). Fixed JSON entry and verified file parses correctly again. - Dashboard holiday banner fix: `dashboard/src/dashboard.tsx` — `loadHolidayStatus` now uses a stable `useCallback` with empty deps, preventing repeated re-creation on render. `useEffect` depends only on the stable callback reference. - Dashboard Syncfusion stale-render fix: `MessageComponent` in the holiday banner now receives `key={`${severity}:${text}`}` to force remount when severity or text changes; without this Syncfusion cached stale DOM and the banner did not update reactively. 
diff --git a/MQTT_EVENT_PAYLOAD_GUIDE.md b/MQTT_EVENT_PAYLOAD_GUIDE.md index d316a54..5424ea0 100644 --- a/MQTT_EVENT_PAYLOAD_GUIDE.md +++ b/MQTT_EVENT_PAYLOAD_GUIDE.md @@ -50,6 +50,91 @@ Contract notes: - Heartbeat republishes keep `intent_id` stable while refreshing `issued_at` and `expires_at`. - Expiry is poll-based: `max(3 x poll_interval_sec, 90)`. +### Service-Failed Notification (client → server, retained) +- **Topic**: `infoscreen/{uuid}/service_failed` +- **QoS**: 1 +- **Retained**: Yes +- **Direction**: client → server +- **Purpose**: Client signals that systemd has exhausted restart attempts (`StartLimitBurst` exceeded) — manual intervention is required. + +Example payload: + +```json +{ + "event": "service_failed", + "unit": "infoscreen-simclient.service", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "failed_at": "2026-04-05T08:00:00Z" +} +``` + +Contract notes: +- Message is retained so the server receives it even after a broker restart. +- Server persists `service_failed_at` and `service_failed_unit` to the `clients` table. +- To clear after resolution: `POST /api/clients//clear_service_failed` — clears the DB flag and publishes an empty retained payload to delete the retained message from the broker. +- Empty payload (empty bytes) on this topic = retain-clear in transit; listener ignores it. + +### Client Command Intent (Phase 1) +- **Topic**: `infoscreen/{uuid}/commands` +- **QoS**: 1 +- **Retained**: No +- **Format**: JSON object +- **Purpose**: Per-client control commands (currently `restart` and `shutdown`) + +Compatibility note: +- During restart transition, server also publishes legacy restart command to `clients/{uuid}/restart` with payload `{ "action": "restart" }`. +- During topic naming transition, server also publishes command payload to `infoscreen/{uuid}/command`. 
+ +Example payload: + +```json +{ + "schema_version": "1.0", + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "action": "reboot_host", + "issued_at": "2026-04-03T12:48:10Z", + "expires_at": "2026-04-03T12:52:10Z", + "requested_by": 1, + "reason": "operator_request" +} +``` + +Contract notes: +- Clients must reject stale commands where local UTC time is greater than `expires_at`. +- Clients must deduplicate by `command_id` and never execute a duplicate command twice. +- `schema_version` is required for forward-compatibility. +- Allowed command action values in v1: `reboot_host`, `shutdown_host`, `restart_app`. +- `restart_app` = soft app restart (no OS reboot); `reboot_host` = full OS reboot. +- API mapping for operators: restart endpoint emits `reboot_host`; shutdown endpoint emits `shutdown_host`. + +### Client Command Acknowledgements (Phase 1) +- **Topic**: `infoscreen/{uuid}/commands/ack` +- **QoS**: 1 (recommended) +- **Retained**: No +- **Format**: JSON object +- **Purpose**: Client reports command lifecycle progression back to server + +Compatibility note: +- During topic naming transition, listener also accepts acknowledgements from `infoscreen/{uuid}/command/ack`. 
+ +Example payload: + +```json +{ + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "status": "execution_started", + "error_code": null, + "error_message": null +} +``` + +Allowed `status` values: +- `accepted` +- `execution_started` +- `completed` +- `failed` + ## Message Structure ### General Principles diff --git a/README.md b/README.md index 4dcbf48..b2e4041 100644 --- a/README.md +++ b/README.md @@ -192,9 +192,18 @@ Rollout strategy (Phase 1): - [AI-INSTRUCTIONS-MAINTENANCE.md](AI-INSTRUCTIONS-MAINTENANCE.md) - [DEV-CHANGELOG.md](DEV-CHANGELOG.md) +### Active Implementation Plans +- [implementation-plans/reboot-implementation-handoff-share.md](implementation-plans/reboot-implementation-handoff-share.md) +- [implementation-plans/reboot-implementation-handoff-client-team.md](implementation-plans/reboot-implementation-handoff-client-team.md) +- [implementation-plans/reboot-kickoff-summary.md](implementation-plans/reboot-kickoff-summary.md) + ## API Highlights - Core resources: clients, groups, events, academic periods +- Client command lifecycle: + - `POST /api/clients//restart` + - `POST /api/clients//shutdown` + - `GET /api/clients/commands/` - Holidays: `GET/POST /api/holidays`, `POST /api/holidays/upload`, `PUT/DELETE /api/holidays/` - Media: upload/download/stream + conversion status - Auth: login/logout/change-password diff --git a/RESTART_VALIDATION_CHECKLIST.md b/RESTART_VALIDATION_CHECKLIST.md new file mode 100644 index 0000000..9b3afd4 --- /dev/null +++ b/RESTART_VALIDATION_CHECKLIST.md @@ -0,0 +1,149 @@ +# Restart Validation Checklist + +Purpose: Validate end-to-end restart command flow after MQTT auth hardening. 
+ +## Scope + +- API command issue route: `POST /api/clients/{uuid}/restart` +- MQTT command topic: `infoscreen/{uuid}/commands` (compat: `infoscreen/{uuid}/command`) +- MQTT ACK topic: `infoscreen/{uuid}/commands/ack` (compat: `infoscreen/{uuid}/command/ack`) +- Status API: `GET /api/clients/commands/{command_id}` + +## Preconditions + +- Stack is up and healthy (`db`, `mqtt`, `server`, `listener`, `scheduler`). +- You have an `admin` or `superadmin` account. +- At least one canary client is online and can process restart commands. +- `.env` has valid `MQTT_USER` / `MQTT_PASSWORD`. + +## 1) Open Monitoring Session (MQTT) + +On host/server: + +```bash +set -a +. ./.env +set +a + +mosquitto_sub -h 127.0.0.1 -p 1883 \ + -u "$MQTT_USER" -P "$MQTT_PASSWORD" \ + -t "infoscreen/+/commands" \ + -t "infoscreen/+/commands/ack" \ + -t "infoscreen/+/command" \ + -t "infoscreen/+/command/ack" \ + -v +``` + +Expected: +- Command publish appears on `infoscreen/{uuid}/commands`. +- ACK(s) appear on `infoscreen/{uuid}/commands/ack`. + +## 2) Login and Keep Session Cookie + +```bash +API_BASE="http://127.0.0.1:8000" +USER="" +PASS="" + +curl -sS -X POST "$API_BASE/api/auth/login" \ + -H "Content-Type: application/json" \ + -d "{\"username\":\"$USER\",\"password\":\"$PASS\"}" \ + -c /tmp/infoscreen-cookies.txt +``` + +Expected: +- Login success response. +- Cookie jar file created at `/tmp/infoscreen-cookies.txt`. + +## 3) Pick Target Client UUID + +Option A: Use known canary UUID. + +Option B: query alive clients: + +```bash +curl -sS "$API_BASE/api/clients/with_alive_status" -b /tmp/infoscreen-cookies.txt +``` + +Choose one `uuid` where `is_alive` is `true`. + +## 4) Issue Restart Command + +```bash +CLIENT_UUID="" + +curl -sS -X POST "$API_BASE/api/clients/$CLIENT_UUID/restart" \ + -H "Content-Type: application/json" \ + -b /tmp/infoscreen-cookies.txt \ + -d '{"reason":"canary_restart_validation"}' +``` + +Expected: +- HTTP `202` on success. 
+- JSON includes `command.commandId` and initial status around `published`. +- In MQTT monitor, a command payload with: + - `schema_version: "1.0"` + - `action: "reboot_host"` + - matching `command_id`. + +## 5) Poll Command Lifecycle Until Terminal + +```bash +COMMAND_ID="" + +for i in $(seq 1 20); do + curl -sS "$API_BASE/api/clients/commands/$COMMAND_ID" -b /tmp/infoscreen-cookies.txt + echo + sleep 3 +done +``` + +Expected status progression (typical): +- `queued` -> `publish_in_progress` -> `published` -> `ack_received` -> `execution_started` -> `completed` + +Failure/alternate terminal states: +- `failed` (check `errorCode` / `errorMessage`) +- `blocked_safety` (reboot lockout triggered) + +## 6) Validate Offline/Timeout Behavior + +- Repeat step 4 for an offline client (or stop client process first). +- Confirm command does not falsely end as `completed`. +- Confirm status remains non-success and has usable failure diagnostics. + +## 7) Validate Safety Lockout + +Current lockout in API route: +- Threshold: 3 reboot commands +- Window: 15 minutes + +Test: +- Send 4 restart commands quickly for same `uuid`. + +Expected: +- One request returns HTTP `429`. +- Command entry state `blocked_safety` with lockout error details. + +## 8) Service Log Spot Check + +```bash +docker compose logs --tail=150 server listener mqtt +``` + +Expected: +- No MQTT auth errors (`Not authorized`, `Connection Refused: not authorised`). +- Listener logs show ACK processing for `command_id`. + +## 9) Acceptance Criteria + +- Restart command publish is visible on MQTT. +- ACK is received and mapped by listener. +- Status endpoint reaches correct terminal state. +- Safety lockout works under repeated restart attempts. +- No auth regression in broker/service logs. 
+ +## Cleanup + +```bash +rm -f /tmp/infoscreen-cookies.txt +``` diff --git a/TECH-CHANGELOG.md b/TECH-CHANGELOG.md index 0c80098..a37bb5f 100644 --- a/TECH-CHANGELOG.md +++ b/TECH-CHANGELOG.md @@ -5,6 +5,42 @@ This changelog documents technical and developer-relevant changes included in public releases. For development workspace changes, see DEV-CHANGELOG.md. Not all changes here are reflected in the user-facing changelog (`program-info.json`), and not all UI/feature changes are repeated here. Some changes (e.g., backend refactoring, API adjustments, infrastructure, developer tooling, or internal logic) may only appear in TECH-CHANGELOG.md. For UI/feature changes, see `dashboard/public/program-info.json`. +## Unreleased +- **Crash detection, auto-recovery, and service_failed monitoring (2026-04-05)**: + - Added `GET /api/clients/crashed` endpoint: returns active clients with `process_status=crashed` or stale heartbeat beyond grace period, with `crash_reason` field. + - Added `restart_app` command action alongside existing `reboot_host`/`shutdown_host`; registered in `server/routes/clients.py` with same safety lockout. + - Scheduler: Added crash auto-recovery loop (feature-flagged via `CRASH_RECOVERY_ENABLED`): scans candidates via `get_crash_recovery_candidates()`, issues `reboot_host` command per client, publishes to primary + compat MQTT topics, updates command lifecycle. + - Scheduler: Added unconditional command expiry sweep each poll cycle via `sweep_expired_commands()` in `scheduler/db_utils.py`: marks non-terminal `ClientCommand` rows with `expires_at < now` as `expired`. + - Added `service_failed` topic ingestion in `listener/listener.py`: subscribe to `infoscreen/+/service_failed` on every connect; persist `service_failed_at` and `service_failed_unit` on Client; empty payload (retain clear) ignored. 
+ - Added `broker_connection` block extraction in health payload handler: persists `mqtt_reconnect_count` and `mqtt_last_disconnect_at` from `infoscreen/{uuid}/health`. + - Added four new DB columns to `clients` table via migration `b1c2d3e4f5a6`: `service_failed_at`, `service_failed_unit`, `mqtt_reconnect_count`, `mqtt_last_disconnect_at`. + - Added `GET /api/clients/service_failed` endpoint: lists clients with `service_failed_at` set, ordered by event time desc. + - Added `POST /api/clients//clear_service_failed` endpoint: clears DB flag and publishes empty retained MQTT message to clear `infoscreen/{uuid}/service_failed`. + - Monitoring overview API (`GET /api/client-logs/monitoring-overview`) now includes `mqtt_reconnect_count` and `mqtt_last_disconnect_at` per client. + - Frontend: Added orange service-failed alert panel to monitoring page (hidden when empty, auto-refresh 15s, per-row Quittieren button with loading/success/error states). + - Frontend: Client detail panel in monitoring now shows MQTT reconnect count and last disconnect timestamp. + - Frontend: Added `ServiceFailedClient`, `ServiceFailedClientsResponse` types; `fetchServiceFailedClients()` and `clearServiceFailed()` API helpers in `dashboard/src/apiClients.ts`. + - Added `service_failed` topic contract to `MQTT_EVENT_PAYLOAD_GUIDE.md`. +- 🔐 **MQTT auth hardening for server-side services (2026-04-03)**: + - `listener/listener.py` now uses env-based broker connectivity for host/port and credentials (`MQTT_BROKER_HOST`, `MQTT_BROKER_PORT`, `MQTT_USER`, `MQTT_PASSWORD`) instead of anonymous fixed `mqtt:1883`. + - `scheduler/scheduler.py` now uses the same env-based MQTT auth path and optional TLS toggles (`MQTT_TLS_ENABLED`, `MQTT_TLS_CA_CERT`, `MQTT_TLS_CERTFILE`, `MQTT_TLS_KEYFILE`, `MQTT_TLS_INSECURE`). + - `docker-compose.yml` and `docker-compose.override.yml` now pass MQTT credentials into listener and scheduler containers for consistent authenticated connections. 
+ - Mosquitto is now configured for authenticated access (`allow_anonymous false`, `password_file /mosquitto/config/passwd`) and bootstraps credentials from env at container startup. + - MQTT healthcheck publish now authenticates with configured broker credentials. +- 🔁 **Client command lifecycle foundation (restart/shutdown) (2026-04-03)**: + - Added persistent command tracking model `ClientCommand` in `models/models.py` and Alembic migration `aa12bb34cc56_add_client_commands_table.py`. + - Upgraded `POST /api/clients//restart` from fire-and-forget publish to lifecycle-aware command issuance with command metadata (`command_id`, `issued_at`, `expires_at`, `reason`, `requested_by`). + - Added `POST /api/clients//shutdown` endpoint with the same lifecycle contract. + - Added `GET /api/clients/commands/` status endpoint for command-state polling. + - Added restart safety lockout in API path: max 3 restart commands per client in rolling 15 minutes, returning `blocked_safety` when threshold is exceeded. + - Added command MQTT publish to `infoscreen/{uuid}/commands` (QoS1, non-retained) and temporary legacy restart compatibility publish to `clients/{uuid}/restart`. + - Added temporary topic compatibility publish to `infoscreen/{uuid}/command` and listener acceptance of `infoscreen/{uuid}/command/ack` to bridge singular/plural naming assumptions. + - Canonicalized command payload action values to host-level semantics: `reboot_host` and `shutdown_host` (API routes remain `/restart` and `/shutdown` for operator UX compatibility). + - Added frozen payload validation snippets for integration/client tooling in `implementation-plans/reboot-command-payload-schemas.md` and `implementation-plans/reboot-command-payload-schemas.json`. + - Listener now subscribes to `infoscreen/{uuid}/commands/ack` and maps client acknowledgements into command lifecycle states (`ack_received`, `execution_started`, `completed`, `failed`). 
+ - Initial lifecycle statuses implemented server-side: `queued`, `publish_in_progress`, `published`, `failed`, and `blocked_safety`. + - Frontend API helper extended in `dashboard/src/apiClients.ts` with `ClientCommand` typing plus command APIs for shutdown and status polling preparation. + ## 2026.1.0-alpha.16 (2026-04-02) - 🐛 **Dashboard holiday banner refactoring and state fix (`dashboard/src/dashboard.tsx`)**: - **Motivation — unstable fetch function:** `loadHolidayStatus` had `location.pathname` in its `useCallback` dependency array, causing a new function reference to be created on every navigation event. The `useEffect` depending on that reference then re-fired, producing overlapping API calls at mount that cancelled each other via the request-sequence guard, leaving the banner unresolved. diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..7d61ab3 --- /dev/null +++ b/TODO.md @@ -0,0 +1,55 @@ +# TODO + +## MQTT TLS Hardening (Production) + +- [ ] Enable TLS listener in `mosquitto/config/mosquitto.conf` (e.g., port 8883) while keeping 1883 only for temporary migration if needed. +- [ ] Generate and deploy server certificate + private key for Mosquitto (CA-signed or internal PKI). +- [ ] Add CA certificate distribution strategy for all clients and services (server, listener, scheduler, external monitors). +- [ ] Set strict file permissions for cert/key material (`chmod 600` for keys, least-privilege ownership). +- [ ] Update Docker Compose MQTT service to mount TLS cert/key/CA paths read-only. +- [ ] Add environment variables for TLS in `.env` / `.env.example`: + - `MQTT_TLS_ENABLED=true` + - `MQTT_TLS_CA_CERT=` + - `MQTT_TLS_CERTFILE=` (if mutual TLS used) + - `MQTT_TLS_KEYFILE=` (if mutual TLS used) + - `MQTT_TLS_INSECURE=false` +- [ ] Switch internal services to TLS connection settings and verify authenticated reconnect behavior. +- [ ] Decide policy: TLS-only auth (username/password over TLS) vs mutual TLS + username/password. 
+- [ ] Disable non-TLS listener (1883) after all clients migrated. +- [ ] Restrict MQTT firewall ingress to trusted source ranges only. +- [ ] Add Mosquitto ACL file for topic-level permissions per role/client type. +- [ ] Add cert rotation process (renewal schedule, rollout, rollback steps). +- [ ] Add monitoring/alerting for certificate expiry and broker auth failures. +- [ ] Add runbook section for external monitoring clients (how to connect with CA validation). +- [ ] Perform a staged rollout (canary group first), then full migration. +- [ ] Document final TLS contract in `MQTT_EVENT_PAYLOAD_GUIDE.md` and deployment docs. + +## Client Recovery Paths + +### Path 1 — Software running → restart via MQTT ✅ +- Server-side fully implemented (`restart_app` action, command lifecycle, monitoring panel). +- [ ] Client team: handle `restart_app` action in command handler (soft app restart, no reboot). + +### Path 2 — Software crashed → MQTT unavailable +- Robust solution is **systemd `Restart=always`** (or `Restart=on-failure`) on the client device — no server involvement, OS init system restarts the process automatically. +- Server detects the crash via missing heartbeat (`process_status=crashed`), records it, and shows it in the monitoring panel. Recovery is confirmed when heartbeats resume. +- [ ] Client team: ensure the infoscreen service unit has `Restart=always` and `RestartSec=` configured in its systemd unit file. +- [ ] Evaluate whether MQTT `clean_session=False` + fixed `client_id` is worth adding for cases where the app crashes but the MQTT connection briefly survives (would allow QoS1 command delivery on reconnect). +- Note: the existing scheduler crash recovery (`reboot_host` via MQTT) is unreliable for a fully crashed app unless the client uses a persistent MQTT session. Revisit if client team enables `clean_session=False`. 
+ +### Path 3 — OS crashed / hung → power cycle needed (customer-dependent) +- No software-based recovery path is possible when the OS is unresponsive. +- Recovery requires external hardware intervention; options depend on customer infrastructure: + - Smart plug / PDU with API (e.g., Shelly, Tasmota, APC, Raritan) + - IPMI / iDRAC / BMC (server-class hardware) + - CEC power command from another device on the same HDMI chain + - Wake-on-LAN after a scheduled power-cut (limited applicability) +- [ ] Clarify with customer which hardware is available / acceptable. +- [ ] If a smart plug or PDU API is chosen: design a server-side "hard power cycle" command type and integration (out of scope until hardware is confirmed). +- [ ] Document chosen solution and integrate into monitoring runbook once decided. + +## Optional Security Follow-ups + +- [ ] Move MQTT credentials to Docker secrets or a vault-backed secret source. +- [ ] Rotate `MQTT_USER`/`MQTT_PASSWORD` on a fixed schedule. +- [ ] Add fail2ban/rate-limiting protections for exposed broker ports. diff --git a/dashboard/index.html b/dashboard/index.html index e4b78ea..14bed27 100644 --- a/dashboard/index.html +++ b/dashboard/index.html @@ -2,9 +2,9 @@ - + - Vite + React + TS + Infoscreen
diff --git a/dashboard/public/favicon.png b/dashboard/public/favicon.png new file mode 100644 index 0000000..0ffb524 Binary files /dev/null and b/dashboard/public/favicon.png differ diff --git a/dashboard/public/program-info.json b/dashboard/public/program-info.json index 64a84b8..32ca43b 100644 --- a/dashboard/public/program-info.json +++ b/dashboard/public/program-info.json @@ -25,10 +25,6 @@ { "name": "Alembic", "license": "MIT" } ] }, - "buildInfo": { - "buildDate": "2025-12-29T12:00:00Z", - "commitId": "9f2ae8b44c3a" - }, "changelog": [ { "version": "2026.1.0-alpha.16", diff --git a/dashboard/src/apiClientMonitoring.ts b/dashboard/src/apiClientMonitoring.ts index 7254dba..c8772a8 100644 --- a/dashboard/src/apiClientMonitoring.ts +++ b/dashboard/src/apiClientMonitoring.ts @@ -39,6 +39,8 @@ export interface MonitoringClient { }; latestLog?: MonitoringLogEntry | null; latestError?: MonitoringLogEntry | null; + mqttReconnectCount?: number | null; + mqttLastDisconnectAt?: string | null; } export interface MonitoringOverview { diff --git a/dashboard/src/apiClients.ts b/dashboard/src/apiClients.ts index 11095b2..777f56b 100644 --- a/dashboard/src/apiClients.ts +++ b/dashboard/src/apiClients.ts @@ -24,6 +24,62 @@ export interface Group { is_active?: boolean; clients: Client[]; } + +export interface CrashedClient { + uuid: string; + description?: string | null; + hostname?: string | null; + ip?: string | null; + group_id?: number | null; + is_alive: boolean; + process_status?: string | null; + screen_health_status?: string | null; + last_alive?: string | null; + crash_reason: 'process_crashed' | 'heartbeat_stale'; +} + +export interface CrashedClientsResponse { + crashed_count: number; + grace_period_seconds: number; + clients: CrashedClient[]; +} + +export interface ServiceFailedClient { + uuid: string; + description?: string | null; + hostname?: string | null; + ip?: string | null; + group_id?: number | null; + is_alive: boolean; + last_alive?: string | null; + 
service_failed_at: string; + service_failed_unit?: string | null; +} + +export interface ServiceFailedClientsResponse { + service_failed_count: number; + clients: ServiceFailedClient[]; +} + +export interface ClientCommand { + commandId: string; + clientUuid: string; + action: 'reboot_host' | 'shutdown_host' | 'restart_app'; + status: string; + reason?: string | null; + requestedBy?: number | null; + issuedAt?: string | null; + expiresAt?: string | null; + publishedAt?: string | null; + ackedAt?: string | null; + executionStartedAt?: string | null; + completedAt?: string | null; + failedAt?: string | null; + errorCode?: string | null; + errorMessage?: string | null; + createdAt?: string | null; + updatedAt?: string | null; +} // Liefert alle Gruppen mit zugehörigen Clients export async function fetchGroupsWithClients(): Promise { const response = await fetch('/api/groups/with_clients'); @@ -79,9 +135,11 @@ export async function updateClient(uuid: string, data: { description?: string; m return await res.json(); } -export async function restartClient(uuid: string): Promise<{ success: boolean; message?: string }> { +export async function restartClient(uuid: string, reason?: string): Promise<{ success: boolean; message?: string; command?: ClientCommand }> { const response = await fetch(`/api/clients/${uuid}/restart`, { method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ reason: reason || null }), }); if (!response.ok) { const error = await response.json(); @@ -90,6 +148,58 @@ export async function restartClient(uuid: string): Promise<{ success: boolean; m return await response.json(); } +export async function shutdownClient(uuid: string, reason?: string): Promise<{ success: boolean; message?: string; command?: ClientCommand }> { + const response = await fetch(`/api/clients/${uuid}/shutdown`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ reason: reason || null }), + }); + if 
(!response.ok) { + const error = await response.json(); + throw new Error(error.error || 'Fehler beim Herunterfahren des Clients'); + } + return await response.json(); +} + +export async function fetchClientCommandStatus(commandId: string): Promise { + const response = await fetch(`/api/clients/commands/${commandId}`); + if (!response.ok) { + const error = await response.json(); + throw new Error(error.error || 'Fehler beim Laden des Command-Status'); + } + return await response.json(); +} + +export async function fetchCrashedClients(): Promise { + const response = await fetch('/api/clients/crashed', { credentials: 'include' }); + if (!response.ok) { + const err = await response.json().catch(() => ({})); + throw new Error(err.error || 'Fehler beim Laden der abgestürzten Clients'); + } + return await response.json(); +} + +export async function fetchServiceFailedClients(): Promise { + const response = await fetch('/api/clients/service_failed', { credentials: 'include' }); + if (!response.ok) { + const err = await response.json().catch(() => ({})); + throw new Error(err.error || 'Fehler beim Laden der service_failed Clients'); + } + return await response.json(); +} + +export async function clearServiceFailed(uuid: string): Promise<{ success: boolean; message?: string }> { + const response = await fetch(`/api/clients/${uuid}/clear_service_failed`, { + method: 'POST', + credentials: 'include', + }); + if (!response.ok) { + const err = await response.json().catch(() => ({})); + throw new Error(err.error || 'Fehler beim Quittieren des service_failed Flags'); + } + return await response.json(); +} + export async function deleteClient(uuid: string) { const res = await fetch(`/api/clients/${uuid}`, { method: 'DELETE', diff --git a/dashboard/src/monitoring.css b/dashboard/src/monitoring.css index 0d3c45e..16f91ee 100644 --- a/dashboard/src/monitoring.css +++ b/dashboard/src/monitoring.css @@ -370,4 +370,49 @@ .monitoring-log-dialog-actions { padding: 0 0.2rem 0.4rem; } +} + 
+/* Crash recovery panel */ +.monitoring-crash-panel { + border-left: 4px solid #dc2626; + margin-bottom: 1.5rem; +} + +.monitoring-service-failed-panel { + border-left: 4px solid #ea580c; + margin-bottom: 1.5rem; +} + +.monitoring-crash-table { + width: 100%; + border-collapse: collapse; + font-size: 0.875rem; +} + +.monitoring-crash-table th { + text-align: left; + padding: 0.5rem 0.75rem; + font-weight: 600; + color: #64748b; + border-bottom: 1px solid #e2e8f0; + background: #f8fafc; +} + +.monitoring-crash-table td { + padding: 0.55rem 0.75rem; + border-bottom: 1px solid #f1f5f9; + vertical-align: middle; +} + +.monitoring-crash-table tr:last-child td { + border-bottom: none; +} + +.monitoring-crash-table tr:hover td { + background: #fef2f2; +} + +.monitoring-meta-hint { + color: #94a3b8; + font-size: 0.8rem; } \ No newline at end of file diff --git a/dashboard/src/monitoring.tsx b/dashboard/src/monitoring.tsx index 7ba6f75..b1ec075 100644 --- a/dashboard/src/monitoring.tsx +++ b/dashboard/src/monitoring.tsx @@ -7,6 +7,16 @@ import { type MonitoringLogEntry, type MonitoringOverview, } from './apiClientMonitoring'; +import { + fetchCrashedClients, + fetchServiceFailedClients, + clearServiceFailed, + restartClient, + type CrashedClient, + type CrashedClientsResponse, + type ServiceFailedClient, + type ServiceFailedClientsResponse, +} from './apiClients'; import { useAuth } from './useAuth'; import { ButtonComponent } from '@syncfusion/ej2-react-buttons'; import { DropDownListComponent } from '@syncfusion/ej2-react-dropdowns'; @@ -156,6 +166,12 @@ const MonitoringDashboard: React.FC = () => { const [screenshotErrored, setScreenshotErrored] = React.useState(false); const selectedClientUuidRef = React.useRef(null); const [selectedLogEntry, setSelectedLogEntry] = React.useState(null); + const [crashedClients, setCrashedClients] = React.useState(null); + const [restartStates, setRestartStates] = React.useState>({}); + const [restartErrors, setRestartErrors] = 
React.useState>({}); + const [serviceFailedClients, setServiceFailedClients] = React.useState(null); + const [clearStates, setClearStates] = React.useState>({}); + const [clearErrors, setClearErrors] = React.useState>({}); const selectedClient = React.useMemo(() => { if (!overview || !selectedClientUuid) return null; @@ -197,9 +213,37 @@ const MonitoringDashboard: React.FC = () => { } }, []); + const loadCrashedClients = React.useCallback(async () => { + try { + const data = await fetchCrashedClients(); + setCrashedClients(data); + } catch { + // non-fatal: crashes panel just stays stale + } + }, []); + + const loadServiceFailedClients = React.useCallback(async () => { + try { + const data = await fetchServiceFailedClients(); + setServiceFailedClients(data); + } catch { + // non-fatal + } + }, []); + React.useEffect(() => { loadOverview(hours, false); - }, [hours, loadOverview]); + loadCrashedClients(); + loadServiceFailedClients(); + }, [hours, loadOverview, loadCrashedClients, loadServiceFailedClients]); + + React.useEffect(() => { + const id = window.setInterval(() => { + loadCrashedClients(); + loadServiceFailedClients(); + }, REFRESH_INTERVAL_MS); + return () => window.clearInterval(id); + }, [loadCrashedClients, loadServiceFailedClients]); React.useEffect(() => { const hasActivePriorityScreenshots = (overview?.summary.activePriorityScreenshots || 0) > 0; @@ -308,6 +352,194 @@ const MonitoringDashboard: React.FC = () => { {renderMetricCard('Fehler-Logs', overview?.summary.errorLogs || 0, 'Im gewählten Zeitraum', '#b91c1c')} + {crashedClients && crashedClients.crashed_count > 0 && ( +
+
+

+ Abgestürzte / Nicht erreichbare Clients +

+ + {crashedClients.crashed_count} + +
+ + + + + + + + + + + + + {crashedClients.clients.map((c: CrashedClient) => { + const state = restartStates[c.uuid] || 'idle'; + const errMsg = restartErrors[c.uuid]; + const displayName = c.description || c.hostname || c.uuid; + return ( + + + + + + + + + ); + })} + +
ClientGruppeUrsacheProzessstatusLetztes SignalAktion
+ {displayName} + {c.ip && ({c.ip})} + {c.group_id ?? '—'} + + {c.crash_reason === 'process_crashed' ? 'Prozess abgestürzt' : 'Heartbeat veraltet'} + + {c.process_status || '—'}{formatRelative(c.last_alive)} + {state === 'loading' && Wird gesendet…} + {state === 'success' && ✓ Neustart gesendet} + {state === 'failed' && ( + + ✗ Fehler + + )} + {(state === 'idle' || state === 'failed') && ( + { + setRestartStates(prev => ({ ...prev, [c.uuid]: 'loading' })); + setRestartErrors(prev => { const n = { ...prev }; delete n[c.uuid]; return n; }); + try { + await restartClient(c.uuid, c.crash_reason); + setRestartStates(prev => ({ ...prev, [c.uuid]: 'success' })); + setTimeout(() => { + setRestartStates(prev => ({ ...prev, [c.uuid]: 'idle' })); + loadCrashedClients(); + }, 8000); + } catch (e) { + const msg = e instanceof Error ? e.message : 'Unbekannter Fehler'; + setRestartStates(prev => ({ ...prev, [c.uuid]: 'failed' })); + setRestartErrors(prev => ({ ...prev, [c.uuid]: msg })); + } + }} + > + Neustart + + )} +
+
+ )} + + {serviceFailedClients && serviceFailedClients.service_failed_count > 0 && ( +
+
+

+ Service dauerhaft ausgefallen (systemd hat aufgegeben) +

+ + {serviceFailedClients.service_failed_count} + +
+

+ Diese Clients konnten von systemd nicht mehr automatisch neugestartet werden. + Manuelle Intervention erforderlich. Nach Behebung bitte quittieren. +

+ + + + + + + + + + + + + {serviceFailedClients.clients.map((c: ServiceFailedClient) => { + const state = clearStates[c.uuid] || 'idle'; + const errMsg = clearErrors[c.uuid]; + const displayName = c.description || c.hostname || c.uuid; + const failedAt = c.service_failed_at + ? new Date(c.service_failed_at.endsWith('Z') ? c.service_failed_at : c.service_failed_at + 'Z').toLocaleString('de-DE') + : '—'; + return ( + + + + + + + + + ); + })} + +
ClientGruppeUnitAusgefallen amLetztes SignalAktion
+ {displayName} + {c.ip && ({c.ip})} + {c.group_id ?? '—'}{c.service_failed_unit || '—'}{failedAt}{formatRelative(c.last_alive)} + {state === 'loading' && Wird quittiert…} + {state === 'success' && ✓ Quittiert} + {state === 'failed' && ( + ✗ Fehler + )} + {(state === 'idle' || state === 'failed') && ( + { + setClearStates(prev => ({ ...prev, [c.uuid]: 'loading' })); + setClearErrors(prev => { const n = { ...prev }; delete n[c.uuid]; return n; }); + try { + await clearServiceFailed(c.uuid); + setClearStates(prev => ({ ...prev, [c.uuid]: 'success' })); + setTimeout(() => { + setClearStates(prev => ({ ...prev, [c.uuid]: 'idle' })); + loadServiceFailedClients(); + }, 4000); + } catch (e) { + const msg = e instanceof Error ? e.message : 'Unbekannter Fehler'; + setClearStates(prev => ({ ...prev, [c.uuid]: 'failed' })); + setClearErrors(prev => ({ ...prev, [c.uuid]: msg })); + } + }} + > + Quittieren + + )} +
+
+ )} + {loading && !overview ? ( ) : ( @@ -393,6 +625,16 @@ const MonitoringDashboard: React.FC = () => { Bildschirmstatus {selectedClient.screenHealthStatus || 'UNKNOWN'} +
+ MQTT Reconnects + {selectedClient.mqttReconnectCount != null ? selectedClient.mqttReconnectCount : '—'} +
+ {selectedClient.mqttLastDisconnectAt && ( +
+ Letzter Disconnect + {formatTimestamp(selectedClient.mqttLastDisconnectAt)} +
+ )}
Letzte Analyse {formatTimestamp(selectedClient.lastScreenshotAnalyzed)} diff --git a/dashboard/src/programminfo.tsx b/dashboard/src/programminfo.tsx index 04f7c0a..c1a112a 100644 --- a/dashboard/src/programminfo.tsx +++ b/dashboard/src/programminfo.tsx @@ -12,10 +12,6 @@ interface ProgramInfo { frontend: { name: string; license: string }[]; backend: { name: string; license: string }[]; }; - buildInfo: { - buildDate: string; - commitId: string; - }; changelog: { version: string; date: string; @@ -85,30 +81,30 @@ const Programminfo: React.FC = () => {
-
-

- Version: {info.version} -

-

- Copyright: {info.copyright} -

-

+

+
Version: {info.version}
+
Copyright: {info.copyright}
+
Support:{' '} {info.supportContact} -

-
-

Build-Informationen

-

- Build-Datum: {new Date(info.buildInfo.buildDate).toLocaleString('de-DE')} -

-

- Commit-ID:{' '} +

+
+
Build-Informationen
+
Build-Datum: {new Date(__BUILD_DATE__).toLocaleString('de-DE')}
+
+ Umgebung:{' '} - {info.buildInfo.commitId} + {__BUILD_ENV__} -

+
+
+ Node.js:{' '} + + {__NODE_VERSION__} + +
diff --git a/dashboard/src/vite-env.d.ts b/dashboard/src/vite-env.d.ts index 11f02fe..723020a 100644 --- a/dashboard/src/vite-env.d.ts +++ b/dashboard/src/vite-env.d.ts @@ -1 +1,5 @@ /// + +declare const __BUILD_DATE__: string; +declare const __NODE_VERSION__: string; +declare const __BUILD_ENV__: string; diff --git a/dashboard/vite.config.ts b/dashboard/vite.config.ts index 2cc4125..934e61f 100644 --- a/dashboard/vite.config.ts +++ b/dashboard/vite.config.ts @@ -6,6 +6,11 @@ import react from '@vitejs/plugin-react'; export default defineConfig({ cacheDir: './.vite', plugins: [react()], + define: { + __BUILD_DATE__: JSON.stringify(new Date().toISOString()), + __NODE_VERSION__: JSON.stringify(process.version), + __BUILD_ENV__: JSON.stringify(process.env.NODE_ENV ?? 'development'), + }, resolve: { // 🔧 KORRIGIERT: Entferne die problematischen Aliases komplett // Diese verursachen das "not an absolute path" Problem diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 8131fa2..8adce3b 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -45,15 +45,37 @@ services: image: eclipse-mosquitto:2.0.21 container_name: infoscreen-mqtt restart: unless-stopped + command: > + sh -c 'set -eu; + : "$${MQTT_USER:?MQTT_USER not set}"; + : "$${MQTT_PASSWORD:?MQTT_PASSWORD not set}"; + touch /mosquitto/config/passwd; + chmod 600 /mosquitto/config/passwd; + mosquitto_passwd -b /mosquitto/config/passwd "$${MQTT_USER}" "$${MQTT_PASSWORD}"; + if [ -n "$${MQTT_CANARY_USER:-}" ] && [ -n "$${MQTT_CANARY_PASSWORD:-}" ]; then + mosquitto_passwd -b /mosquitto/config/passwd "$${MQTT_CANARY_USER}" "$${MQTT_CANARY_PASSWORD}"; + fi; + exec mosquitto -c /mosquitto/config/mosquitto.conf' volumes: - - ./mosquitto/config/mosquitto.conf:/mosquitto/config/mosquitto.conf:ro + - ./mosquitto/config:/mosquitto/config + - ./mosquitto/data:/mosquitto/data + - ./mosquitto/log:/mosquitto/log ports: - "1883:1883" - "9001:9001" + environment: + - MQTT_USER=${MQTT_USER} + - 
MQTT_PASSWORD=${MQTT_PASSWORD} + - MQTT_CANARY_USER=${MQTT_CANARY_USER:-} + - MQTT_CANARY_PASSWORD=${MQTT_CANARY_PASSWORD:-} networks: - infoscreen-net healthcheck: - test: ["CMD-SHELL", "mosquitto_pub -h localhost -t test -m 'health' || exit 1"] + test: + [ + "CMD-SHELL", + "mosquitto_pub -h localhost -u $$MQTT_USER -P $$MQTT_PASSWORD -t test -m 'health' || exit 1", + ] interval: 30s timeout: 5s retries: 3 @@ -125,6 +147,11 @@ services: DB_PASSWORD: ${DB_PASSWORD} DB_NAME: ${DB_NAME} DB_ROOT_PASSWORD: ${DB_ROOT_PASSWORD} + API_BASE_URL: http://server:8000 + MQTT_BROKER_HOST: ${MQTT_BROKER_HOST:-mqtt} + MQTT_BROKER_PORT: ${MQTT_BROKER_PORT:-1883} + MQTT_USER: ${MQTT_USER} + MQTT_PASSWORD: ${MQTT_PASSWORD} networks: - infoscreen-net @@ -141,7 +168,18 @@ services: environment: # HINZUGEFÜGT: Datenbank-Verbindungsstring DB_CONN: "mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}" - MQTT_PORT: 1883 + MQTT_BROKER_HOST: ${MQTT_BROKER_HOST:-mqtt} + MQTT_BROKER_PORT: ${MQTT_BROKER_PORT:-1883} + MQTT_USER: ${MQTT_USER} + MQTT_PASSWORD: ${MQTT_PASSWORD} + POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-30} + POWER_INTENT_PUBLISH_ENABLED: ${POWER_INTENT_PUBLISH_ENABLED:-false} + POWER_INTENT_HEARTBEAT_ENABLED: ${POWER_INTENT_HEARTBEAT_ENABLED:-true} + POWER_INTENT_EXPIRY_MULTIPLIER: ${POWER_INTENT_EXPIRY_MULTIPLIER:-3} + POWER_INTENT_MIN_EXPIRY_SECONDS: ${POWER_INTENT_MIN_EXPIRY_SECONDS:-90} + CRASH_RECOVERY_ENABLED: ${CRASH_RECOVERY_ENABLED:-false} + CRASH_RECOVERY_GRACE_SECONDS: ${CRASH_RECOVERY_GRACE_SECONDS:-180} + CRASH_RECOVERY_LOCKOUT_MINUTES: ${CRASH_RECOVERY_LOCKOUT_MINUTES:-15} networks: - infoscreen-net diff --git a/docker-compose.yml b/docker-compose.yml index c1923c1..de69ac1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,6 +19,10 @@ services: - DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME} - DB_URL=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME} - API_BASE_URL=http://server:8000 + - 
MQTT_BROKER_HOST=${MQTT_BROKER_HOST:-mqtt} + - MQTT_BROKER_PORT=${MQTT_BROKER_PORT:-1883} + - MQTT_USER=${MQTT_USER} + - MQTT_PASSWORD=${MQTT_PASSWORD} - ENV=${ENV:-development} - FLASK_SECRET_KEY=${FLASK_SECRET_KEY:-dev-secret-key-change-in-production} - DEFAULT_SUPERADMIN_USERNAME=${DEFAULT_SUPERADMIN_USERNAME:-superadmin} @@ -70,6 +74,17 @@ services: image: eclipse-mosquitto:2.0.21 # ✅ GUT: Version ist bereits spezifisch container_name: infoscreen-mqtt restart: unless-stopped + command: > + sh -c 'set -eu; + : "$${MQTT_USER:?MQTT_USER not set}"; + : "$${MQTT_PASSWORD:?MQTT_PASSWORD not set}"; + touch /mosquitto/config/passwd; + chmod 600 /mosquitto/config/passwd; + mosquitto_passwd -b /mosquitto/config/passwd "$${MQTT_USER}" "$${MQTT_PASSWORD}"; + if [ -n "$${MQTT_CANARY_USER:-}" ] && [ -n "$${MQTT_CANARY_PASSWORD:-}" ]; then + mosquitto_passwd -b /mosquitto/config/passwd "$${MQTT_CANARY_USER}" "$${MQTT_CANARY_PASSWORD}"; + fi; + exec mosquitto -c /mosquitto/config/mosquitto.conf' volumes: - ./mosquitto/config:/mosquitto/config - ./mosquitto/data:/mosquitto/data @@ -77,13 +92,18 @@ services: ports: - "1883:1883" # Standard MQTT - "9001:9001" # WebSocket (falls benötigt) + environment: + - MQTT_USER=${MQTT_USER} + - MQTT_PASSWORD=${MQTT_PASSWORD} + - MQTT_CANARY_USER=${MQTT_CANARY_USER:-} + - MQTT_CANARY_PASSWORD=${MQTT_CANARY_PASSWORD:-} networks: - infoscreen-net healthcheck: test: [ "CMD-SHELL", - "mosquitto_pub -h localhost -t test -m 'health' || exit 1", + "mosquitto_pub -h localhost -u $$MQTT_USER -P $$MQTT_PASSWORD -t test -m 'health' || exit 1", ] interval: 30s timeout: 5s @@ -169,13 +189,18 @@ services: environment: # HINZUGEFÜGT: Datenbank-Verbindungsstring - DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME} - - MQTT_BROKER_URL=mqtt - - MQTT_PORT=1883 + - MQTT_BROKER_HOST=${MQTT_BROKER_HOST:-mqtt} + - MQTT_BROKER_PORT=${MQTT_BROKER_PORT:-1883} + - MQTT_USER=${MQTT_USER} + - MQTT_PASSWORD=${MQTT_PASSWORD} - 
POLL_INTERVAL_SECONDS=${POLL_INTERVAL_SECONDS:-30} - POWER_INTENT_PUBLISH_ENABLED=${POWER_INTENT_PUBLISH_ENABLED:-false} - POWER_INTENT_HEARTBEAT_ENABLED=${POWER_INTENT_HEARTBEAT_ENABLED:-true} - POWER_INTENT_EXPIRY_MULTIPLIER=${POWER_INTENT_EXPIRY_MULTIPLIER:-3} - POWER_INTENT_MIN_EXPIRY_SECONDS=${POWER_INTENT_MIN_EXPIRY_SECONDS:-90} + - CRASH_RECOVERY_ENABLED=${CRASH_RECOVERY_ENABLED:-false} + - CRASH_RECOVERY_GRACE_SECONDS=${CRASH_RECOVERY_GRACE_SECONDS:-180} + - CRASH_RECOVERY_LOCKOUT_MINUTES=${CRASH_RECOVERY_LOCKOUT_MINUTES:-15} networks: - infoscreen-net diff --git a/implementation-plans/reboot-command-payload-schemas.json b/implementation-plans/reboot-command-payload-schemas.json new file mode 100644 index 0000000..5f96392 --- /dev/null +++ b/implementation-plans/reboot-command-payload-schemas.json @@ -0,0 +1,149 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://infoscreen.local/schemas/reboot-command-payload-schemas.json", + "title": "Infoscreen Reboot Command Payload Schemas", + "description": "Frozen v1 schemas for per-client command and command acknowledgement payloads.", + "$defs": { + "commandPayloadV1": { + "type": "object", + "additionalProperties": false, + "required": [ + "schema_version", + "command_id", + "client_uuid", + "action", + "issued_at", + "expires_at", + "requested_by", + "reason" + ], + "properties": { + "schema_version": { + "type": "string", + "const": "1.0" + }, + "command_id": { + "type": "string", + "format": "uuid" + }, + "client_uuid": { + "type": "string", + "format": "uuid" + }, + "action": { + "type": "string", + "enum": [ + "reboot_host", + "shutdown_host" + ] + }, + "issued_at": { + "type": "string", + "format": "date-time" + }, + "expires_at": { + "type": "string", + "format": "date-time" + }, + "requested_by": { + "type": [ + "integer", + "null" + ], + "minimum": 1 + }, + "reason": { + "type": [ + "string", + "null" + ], + "maxLength": 2000 + } + } + }, + "commandAckPayloadV1": { + 
"type": "object", + "additionalProperties": false, + "required": [ + "command_id", + "status", + "error_code", + "error_message" + ], + "properties": { + "command_id": { + "type": "string", + "format": "uuid" + }, + "status": { + "type": "string", + "enum": [ + "accepted", + "execution_started", + "completed", + "failed" + ] + }, + "error_code": { + "type": [ + "string", + "null" + ], + "maxLength": 128 + }, + "error_message": { + "type": [ + "string", + "null" + ], + "maxLength": 4000 + } + }, + "allOf": [ + { + "if": { + "properties": { + "status": { + "const": "failed" + } + } + }, + "then": { + "properties": { + "error_code": { + "type": "string", + "minLength": 1 + }, + "error_message": { + "type": "string", + "minLength": 1 + } + } + } + } + ] + } + }, + "examples": [ + { + "commandPayloadV1": { + "schema_version": "1.0", + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "action": "reboot_host", + "issued_at": "2026-04-03T12:48:10Z", + "expires_at": "2026-04-03T12:52:10Z", + "requested_by": 1, + "reason": "operator_request" + } + }, + { + "commandAckPayloadV1": { + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "status": "execution_started", + "error_code": null, + "error_message": null + } + } + ] +} diff --git a/implementation-plans/reboot-command-payload-schemas.md b/implementation-plans/reboot-command-payload-schemas.md new file mode 100644 index 0000000..7612eed --- /dev/null +++ b/implementation-plans/reboot-command-payload-schemas.md @@ -0,0 +1,59 @@ +## Reboot Command Payload Schema Snippets + +This file provides copy-ready validation snippets for client and integration test helpers. + +### Canonical Topics (v1) +1. Command topic: infoscreen/{client_uuid}/commands +2. Ack topic: infoscreen/{client_uuid}/commands/ack + +### Transitional Compatibility Topics +1. Command topic alias: infoscreen/{client_uuid}/command +2. 
Ack topic alias: infoscreen/{client_uuid}/command/ack + +### Canonical Action Values +1. reboot_host +2. shutdown_host + +### Ack Status Values +1. accepted +2. execution_started +3. completed +4. failed + +### JSON Schema Source +Use this file for machine validation: +1. implementation-plans/reboot-command-payload-schemas.json + +### Minimal Command Schema Snippet +```json +{ + "type": "object", + "additionalProperties": false, + "required": ["schema_version", "command_id", "client_uuid", "action", "issued_at", "expires_at", "requested_by", "reason"], + "properties": { + "schema_version": { "const": "1.0" }, + "command_id": { "type": "string", "format": "uuid" }, + "client_uuid": { "type": "string", "format": "uuid" }, + "action": { "enum": ["reboot_host", "shutdown_host"] }, + "issued_at": { "type": "string", "format": "date-time" }, + "expires_at": { "type": "string", "format": "date-time" }, + "requested_by": { "type": ["integer", "null"] }, + "reason": { "type": ["string", "null"] } + } +} +``` + +### Minimal Ack Schema Snippet +```json +{ + "type": "object", + "additionalProperties": false, + "required": ["command_id", "status", "error_code", "error_message"], + "properties": { + "command_id": { "type": "string", "format": "uuid" }, + "status": { "enum": ["accepted", "execution_started", "completed", "failed"] }, + "error_code": { "type": ["string", "null"] }, + "error_message": { "type": ["string", "null"] } + } +} +``` diff --git a/implementation-plans/reboot-implementation-handoff-client-team.md b/implementation-plans/reboot-implementation-handoff-client-team.md new file mode 100644 index 0000000..f8c984b --- /dev/null +++ b/implementation-plans/reboot-implementation-handoff-client-team.md @@ -0,0 +1,146 @@ +## Client Team Implementation Spec (Raspberry Pi 5) + +### Mission +Implement client-side command handling for reliable restart and shutdown with strict validation, idempotency, acknowledgements, and reboot recovery continuity. 
+ +### Ownership Boundaries +1. Client team owns command intake, execution, acknowledgement emission, and post-reboot continuity. +2. Platform team owns command issuance, lifecycle aggregation, and server-side escalation logic. +3. Client implementation must not assume managed PoE availability. + +### Required Client Behaviors + +### Frozen MQTT Topics and Schemas (v1) +1. Canonical command topic: infoscreen/{client_uuid}/commands. +2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack. +3. Transitional compatibility topics during migration: +- infoscreen/{client_uuid}/command +- infoscreen/{client_uuid}/command/ack +4. QoS policy: command QoS 1, ack QoS 1 recommended. +5. Retain policy: commands and acks are non-retained. + +Frozen command payload schema: + +```json +{ + "schema_version": "1.0", + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "action": "reboot_host", + "issued_at": "2026-04-03T12:48:10Z", + "expires_at": "2026-04-03T12:52:10Z", + "requested_by": 1, + "reason": "operator_request" +} +``` + +Frozen ack payload schema: + +```json +{ + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "status": "execution_started", + "error_code": null, + "error_message": null +} +``` + +Allowed ack status values: +1. accepted +2. execution_started +3. completed +4. failed + +Frozen command action values for v1: +1. reboot_host +2. shutdown_host + +Reserved but not emitted by server in v1: +1. restart_service + +Validation snippets for helper scripts: +1. Human-readable snippets: implementation-plans/reboot-command-payload-schemas.md +2. Machine-validated JSON Schema: implementation-plans/reboot-command-payload-schemas.json + +### 1. Command Intake +1. Subscribe to the canonical command topic with QoS 1. +2. Parse required fields: schema_version, command_id, action, issued_at, expires_at, reason, requested_by, target metadata. +3. 
Reject invalid payloads with failed acknowledgement including error_code and diagnostic message. +4. Reject stale commands when current time exceeds expires_at. +5. Ignore already-processed command_id values. + +### 2. Idempotency And Persistence +1. Persist processed command_id and execution result on local storage. +2. Persistence must survive service restart and full OS reboot. +3. On restart, reload dedupe cache before processing newly delivered commands. + +### 3. Acknowledgement Contract Behavior +1. Emit accepted immediately after successful validation and dedupe pass. +2. Emit execution_started immediately before invoking the command action. +3. Emit completed only when local success condition is confirmed. +4. Emit failed with structured error_code on validation or execution failure. +5. If MQTT is temporarily unavailable, retry ack publish with bounded backoff until command expiry. + +### 4. Execution Security Model +1. Execute via systemd-managed privileged helper. +2. Allow only whitelisted operations: +- reboot_host +- shutdown_host +3. Optionally keep restart_service handler as reserved path, but do not require it for v1 conformance. +4. Disallow arbitrary shell commands and untrusted arguments. +5. Enforce per-command execution timeout and terminate hung child processes. + +### 5. Reboot Recovery Continuity +1. For reboot_host action: +- send execution_started +- trigger reboot promptly +2. During startup: +- emit heartbeat early +- emit process-health once service is ready +3. Keep last command execution state available after reboot for reconciliation. + +### 6. Time And Timeout Semantics +1. Use monotonic timers for local elapsed-time checks. +2. Use UTC wall-clock only for protocol timestamps and expiry comparisons. +3. Target reconnect baseline on Pi 5 USB-SATA SSD: 90 seconds. +4. Accept cold-boot and USB enumeration ceiling up to 150 seconds. + +### 7. Capability Reporting +1. 
Report recovery capability class: +- software_only +- managed_poe_available +- manual_only +2. Report watchdog enabled status. +3. Report boot-source metadata for diagnostics. + +### 8. Error Codes Minimum Set +1. invalid_schema +2. missing_field +3. stale_command +4. duplicate_command +5. permission_denied_local +6. execution_timeout +7. execution_failed +8. broker_unavailable +9. internal_error + +### Acceptance Tests (Client Team) +1. Invalid schema payload is rejected and failed ack emitted. +2. Expired command is rejected and not executed. +3. Duplicate command_id is not executed twice. +4. reboot_host emits execution_started and reconnects with heartbeat in expected window. +5. restart_service action completes without host reboot and emits completed. +6. MQTT outage during ack path retries correctly without duplicate execution. +7. Boot-loop protection cooperates with server-side lockout semantics. + +### Delivery Artifacts +1. Client protocol conformance checklist. +2. Test evidence for all acceptance tests. +3. Runtime logs showing full lifecycle for one restart and one reboot scenario. +4. Known limitations list per image version. + +### Definition Of Done +1. All acceptance tests pass on Pi 5 USB-SATA SSD test devices. +2. No duplicate execution observed under reconnect and retained-delivery edge cases. +3. Acknowledgement sequence is complete and machine-parseable for server correlation. +4. Reboot recovery continuity works without managed PoE dependencies. 
diff --git a/implementation-plans/reboot-implementation-handoff-share.md b/implementation-plans/reboot-implementation-handoff-share.md new file mode 100644 index 0000000..aaa6505 --- /dev/null +++ b/implementation-plans/reboot-implementation-handoff-share.md @@ -0,0 +1,214 @@ +## Remote Reboot Reliability Handoff (Share Document) + +### Purpose +This document defines the agreed implementation scope for reliable remote reboot and shutdown of Raspberry Pi 5 clients, with monitoring-first visibility and safe escalation paths. + +### Scope +1. In scope: restart and shutdown command reliability. +2. In scope: full lifecycle monitoring and audit visibility. +3. In scope: capability-tier recovery model with optional managed PoE escalation. +4. Out of scope: broader maintenance module in client-management for this cycle. +5. Out of scope: mandatory dependency on customer-managed power switching. + +### Agreed Operating Model +1. Command delivery is asynchronous and lifecycle-tracked, not fire-and-forget. +2. Commands use idempotent command_id semantics with stale-command rejection by expires_at. +3. Monitoring is authoritative for operational state and escalation decisions. +4. Recovery must function even when no managed power switching is available. + +### Frozen Contract v1 (Effective Immediately) +1. Canonical command topic: infoscreen/{client_uuid}/commands. +2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack. +3. Transitional compatibility topics accepted during migration: +- infoscreen/{client_uuid}/command +- infoscreen/{client_uuid}/command/ack +4. QoS policy: command QoS 1, ack QoS 1 recommended. +5. Retain policy: commands and acks are non-retained. 
+ +Command payload schema (frozen): + +```json +{ + "schema_version": "1.0", + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "action": "reboot_host", + "issued_at": "2026-04-03T12:48:10Z", + "expires_at": "2026-04-03T12:52:10Z", + "requested_by": 1, + "reason": "operator_request" +} +``` + +Ack payload schema (frozen): + +```json +{ + "command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4", + "status": "execution_started", + "error_code": null, + "error_message": null +} +``` + +Allowed ack status values: +1. accepted +2. execution_started +3. completed +4. failed + +Frozen command action values: +1. reboot_host +2. shutdown_host + +API endpoint mapping: +1. POST /api/clients/{uuid}/restart -> action reboot_host +2. POST /api/clients/{uuid}/shutdown -> action shutdown_host + +Validation snippets: +1. Human-readable snippets: implementation-plans/reboot-command-payload-schemas.md +2. Machine-validated JSON Schema: implementation-plans/reboot-command-payload-schemas.json + +### Command Lifecycle States +1. queued +2. publish_in_progress +3. published +4. ack_received +5. execution_started +6. awaiting_reconnect +7. recovered +8. completed +9. failed +10. expired +11. timed_out +12. canceled +13. blocked_safety +14. manual_intervention_required + +### Timeout Defaults (Pi 5, USB-SATA SSD baseline) +1. queued to publish_in_progress: immediate, timeout 5 seconds. +2. publish_in_progress to published: timeout 8 seconds. +3. published to ack_received: timeout 20 seconds. +4. ack_received to execution_started: 15 seconds for service restart, 25 seconds for host reboot. +5. execution_started to awaiting_reconnect: timeout 10 seconds. +6. awaiting_reconnect to recovered: baseline 90 seconds after validation, cold-boot ceiling 150 seconds. +7. recovered to completed: 15 to 20 seconds based on fleet stability. +8. command expires_at default: 240 seconds, bounded 180 to 360 seconds. + +### Recovery Tiers +1. 
Tier 0 baseline, always required: watchdog, systemd auto-restart, lifecycle tracking, manual intervention fallback. +2. Tier 1 optional: managed PoE per-port power-cycle escalation where customer infrastructure supports it. +3. Tier 2 no remote power control: direct manual intervention workflow. + +### Governance And Safety +1. Role access: admin and superadmin. +2. Bulk actions require reason capture. +3. Safety lockout: maximum 3 reboot commands per client in 15 minutes. +4. Escalation cooldown: 60 seconds before automatic move to manual_intervention_required. + +### MQTT Auth Hardening (Phase 1, Required Before Broad Rollout) +1. Intranet-only deployment is not sufficient protection for privileged MQTT actions by itself. +2. Phase 1 hardening scope is broker authentication, authorization, and network restriction; payload URL allowlisting is deferred to a later client/server feature. +3. MQTT broker must disable anonymous publish/subscribe access in production. +4. MQTT broker must require authenticated identities for server-side publishers and client devices. +5. MQTT broker must enforce ACLs so that: +- only server-side services can publish to `infoscreen/{client_uuid}/commands` +- only server-side services can publish scheduler event topics +- each client can subscribe only to its own command topics and assigned event topics +- each client can publish only its own ack, heartbeat, health, dashboard, and telemetry topics +6. Broker port exposure must be restricted to the management network and approved hosts only. +7. TLS support is strongly recommended in this phase and should be enabled when operationally feasible. + +### Server Team Actions For Auth Hardening +1. Provision broker credentials for command/event publishers and for client devices. +2. Configure Mosquitto or equivalent broker ACLs for per-topic publish and subscribe restrictions. +3. Disable anonymous access on production brokers. +4. 
Restrict broker network exposure with firewall rules, VLAN policy, or equivalent network controls. +5. Update server/frontend deployment to publish MQTT with authenticated credentials. +6. Validate that server-side event publishing and reboot/shutdown command publishing still work under the new ACL policy. +7. Coordinate credential distribution and rotation with the client deployment process. + +### MQTT ACL Matrix (Canonical Baseline) +| Actor | Topic Pattern | Publish | Subscribe | Notes | +| --- | --- | --- | --- | --- | +| scheduler-service | infoscreen/events/+ | Yes | No | Publishes retained active event list per group. | +| api-command-publisher | infoscreen/+/commands | Yes | No | Publishes canonical reboot/shutdown commands. | +| api-command-publisher | infoscreen/+/command | Yes | No | Transitional compatibility publish only. | +| api-group-assignment | infoscreen/+/group_id | Yes | No | Publishes retained client-to-group assignment. | +| listener-service | infoscreen/+/commands/ack | No | Yes | Consumes canonical client command acknowledgements. | +| listener-service | infoscreen/+/command/ack | No | Yes | Consumes transitional compatibility acknowledgements. | +| listener-service | infoscreen/+/heartbeat | No | Yes | Consumes heartbeat telemetry. | +| listener-service | infoscreen/+/health | No | Yes | Consumes health telemetry. | +| listener-service | infoscreen/+/dashboard | No | Yes | Consumes dashboard screenshot payloads. | +| listener-service | infoscreen/+/screenshot | No | Yes | Consumes screenshot payloads (if enabled). | +| listener-service | infoscreen/+/logs/error | No | Yes | Consumes client error logs. | +| listener-service | infoscreen/+/logs/warn | No | Yes | Consumes client warn logs. | +| listener-service | infoscreen/+/logs/info | No | Yes | Consumes client info logs. | +| listener-service | infoscreen/discovery | No | Yes | Consumes discovery announcements. 
| +| listener-service | infoscreen/+/discovery_ack | Yes | No | Publishes discovery acknowledgements. | +| client-{client_uuid} | infoscreen/{client_uuid}/commands | No | Yes | Canonical command intake for this client only. | +| client-{client_uuid} | infoscreen/{client_uuid}/command | No | Yes | Transitional compatibility intake for this client only. | +| client-{client_uuid} | infoscreen/events/{group_id} | No | Yes | Assigned group event feed only; dynamic per assignment. | +| client-{client_uuid} | infoscreen/{client_uuid}/commands/ack | Yes | No | Canonical command acknowledgements for this client only. | +| client-{client_uuid} | infoscreen/{client_uuid}/command/ack | Yes | No | Transitional compatibility acknowledgements for this client only. | +| client-{client_uuid} | infoscreen/{client_uuid}/heartbeat | Yes | No | Heartbeat telemetry. | +| client-{client_uuid} | infoscreen/{client_uuid}/health | Yes | No | Health telemetry. | +| client-{client_uuid} | infoscreen/{client_uuid}/dashboard | Yes | No | Dashboard status and screenshot payloads. | +| client-{client_uuid} | infoscreen/{client_uuid}/screenshot | Yes | No | Screenshot payloads (if enabled). | +| client-{client_uuid} | infoscreen/{client_uuid}/logs/error | Yes | No | Error log stream. | +| client-{client_uuid} | infoscreen/{client_uuid}/logs/warn | Yes | No | Warning log stream. | +| client-{client_uuid} | infoscreen/{client_uuid}/logs/info | Yes | No | Info log stream. | +| client-{client_uuid} | infoscreen/discovery | Yes | No | Discovery announcement. | +| client-{client_uuid} | infoscreen/{client_uuid}/discovery_ack | No | Yes | Discovery acknowledgment from listener. | + +ACL implementation notes: +1. Use per-client identities; client ACLs must be scoped to the exact client UUID and must not allow wildcard access to other clients. +2. Event topic subscription (`infoscreen/events/{group_id}`) should be managed via broker-side ACL provisioning that updates when group assignment changes. +3. Transitional singular command topics are temporary and should be removed after migration cutover. +4. Deny by default: any topic not explicitly listed above should be blocked for each actor. + +### Credential Management Guidance +1. Real MQTT passwords must not be stored in tracked documentation or committed templates. +2. 
Each client device should receive a unique broker username and password, stored only in its local `.env` file on the device. +3. Server-side publisher credentials should be stored in the server team's secret-management path, not in source control. +4. Recommended naming convention for client broker users: `infoscreen-client-{client_uuid}`. +5. Client passwords should be random, at least 20 characters, and rotated through deployment tooling or broker administration procedures. +6. The server/infrastructure team owns broker-side user creation, ACL assignment, rotation, and revocation. +7. The client team owns loading credentials from local env files and validating connection behavior against the secured broker. + +### Client Team Actions For Auth Hardening +1. Add MQTT username/password support in the client connection setup. +2. Add client-side TLS configuration support from environment when certificates are provided. +3. Update local test helpers to support authenticated MQTT publishing and subscription. +4. Validate command and event intake against the authenticated broker configuration before canary rollout. + +### Ready For Server/Frontend Team (Auth Phase) +1. Client implementation is ready to connect with MQTT auth from local `.env` (`MQTT_USERNAME`, `MQTT_PASSWORD`, optional TLS settings). +2. Client command/event intake and client ack/telemetry publishing run over the authenticated MQTT session. +3. Server/frontend team must now complete broker-side enforcement and publisher migration. + +Server/frontend done criteria: +1. Anonymous broker access is disabled in production. +2. Server-side publishers use authenticated broker credentials. +3. ACLs are active and validated for command, event, and client telemetry topics. +4. At least one canary client proves end-to-end flow under ACLs: +- server publishes command/event with authenticated publisher +- client receives payload +- client sends ack/telemetry successfully +5. 
Revocation test passes: disabling one client credential blocks only that client without impacting others. + +Operational note: +1. Client-side auth support is necessary but not sufficient by itself; broker ACL/auth enforcement is the security control that must be enabled by the server/infrastructure team. + +### Rollout Plan +1. Contract freeze and sign-off. +2. Platform and client implementation against frozen schemas. +3. One-group canary. +4. Rollback if failed plus timed_out exceeds 5 percent. +5. Expand only after 7 days below intervention threshold. + +### Success Criteria +1. Deterministic command lifecycle visibility from enqueue to completion. +2. No duplicate execution under reconnect or delayed-delivery conditions. +3. Stable Pi 5 SSD reconnect behavior within defined baseline. +4. Clear and actionable manual intervention states when automatic recovery is exhausted. diff --git a/implementation-plans/reboot-kickoff-summary.md b/implementation-plans/reboot-kickoff-summary.md new file mode 100644 index 0000000..c739640 --- /dev/null +++ b/implementation-plans/reboot-kickoff-summary.md @@ -0,0 +1,54 @@ +## Reboot Reliability Kickoff Summary + +### Objective +Ship a reliable, observable restart and shutdown workflow for Raspberry Pi 5 clients, with safe escalation and clear operator outcomes. + +### What Is Included +1. Asynchronous command lifecycle with idempotent command_id handling. +2. Monitoring-first state visibility from queued to terminal outcomes. +3. Client acknowledgements for accepted, execution_started, completed, and failed. +4. Pi 5 USB-SATA SSD timeout baseline and tuning rules. +5. Capability-tier recovery with optional managed PoE escalation. + +### What Is Not Included +1. Full maintenance module in client-management. +2. Required managed power-switch integration. +3. Production Wake-on-LAN rollout. + +### Team Split +1. Platform team: API command lifecycle, safety controls, listener ack ingestion. +2. 
Web team: lifecycle-aware UX and command status display. +3. Client team: strict validation, dedupe, ack sequence, secure execution helper, reboot continuity. + +### Ownership Matrix +| Team | Primary Plan File | Main Deliverables | +| --- | --- | --- | +| Platform team | implementation-plans/reboot-implementation-handoff-share.md | Command lifecycle backend, policy enforcement, listener ack mapping, safety lockout and escalation | +| Web team | implementation-plans/reboot-implementation-handoff-share.md | Lifecycle UI states, bulk safety UX, capability visibility, command status polling | +| Client team | implementation-plans/reboot-implementation-handoff-client-team.md | Command validation, dedupe persistence, ack sequence, secure execution helper, reboot continuity | +| Project coordination | implementation-plans/reboot-kickoff-summary.md | Phase sequencing, canary gates, rollback thresholds, cross-team sign-off tracking | + +### Baseline Operational Defaults +1. Safety lockout: 3 reboot commands per client in rolling 15 minutes. +2. Escalation cooldown: 60 seconds. +3. Reconnect target on Pi 5 SSD: 90 seconds baseline, 150 seconds cold-boot ceiling. +4. Rollback canary trigger: failed plus timed_out above 5 percent. + +### Frozen Contract Snapshot +1. Canonical command topic: infoscreen/{client_uuid}/commands. +2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack. +3. Transitional compatibility topics during migration: +- infoscreen/{client_uuid}/command +- infoscreen/{client_uuid}/command/ack +4. Command schema version: 1.0. +5. Allowed command actions: reboot_host, shutdown_host. +6. Allowed ack status values: accepted, execution_started, completed, failed. +7. Validation snippets: +- implementation-plans/reboot-command-payload-schemas.md +- implementation-plans/reboot-command-payload-schemas.json + +### Immediate Next Steps +1. Continue implementation in parallel by team against frozen contract. +2. 
Client team validates dedupe and expiry handling on canonical topics. +3. Platform team verifies ack-state transitions for accepted, execution_started, completed, failed. +4. Execute one-group canary and validate timing plus failure drills. diff --git a/implementation-plans/server-team-actions.md b/implementation-plans/server-team-actions.md new file mode 100644 index 0000000..a28faa8 --- /dev/null +++ b/implementation-plans/server-team-actions.md @@ -0,0 +1,127 @@ +# Server Team Action Items — Infoscreen Client + +This document lists everything the server/infrastructure/frontend team must implement to complete the client integration. The client-side code is production-ready for all items listed here. + +--- + +## 1. MQTT Broker Hardening (prerequisite for everything else) + +- Disable anonymous access on the broker. +- Create one broker account **per client device**: + - Username convention: `infoscreen-client-{uuid}` (e.g. `infoscreen-client-9b8d1856`) + - Provision the generated password to the device `.env` as `MQTT_PASSWORD={generated_password}` +- Create a **server/publisher account** (e.g. `infoscreen-server`) for all server-side publishes. +- Enforce ACLs: + +| Topic | Publisher | +|---|---| +| `infoscreen/{uuid}/commands` | server only | +| `infoscreen/{uuid}/command` (alias) | server only | +| `infoscreen/{uuid}/group_id` | server only | +| `infoscreen/events/{group_id}` | server only | +| `infoscreen/groups/+/power/intent` | server only | +| `infoscreen/{uuid}/commands/ack` | client only | +| `infoscreen/{uuid}/command/ack` | client only | +| `infoscreen/{uuid}/heartbeat` | client only | +| `infoscreen/{uuid}/health` | client only | +| `infoscreen/{uuid}/logs/#` | client only | +| `infoscreen/{uuid}/service_failed` | client only | + +--- + +## 2. 
Reboot / Shutdown Command — Ack Lifecycle + +Client publishes ack status updates to two topics per command (canonical + transitional alias): +- `infoscreen/{uuid}/commands/ack` +- `infoscreen/{uuid}/command/ack` + +**Ack payload schema (v1, frozen):** +```json +{ + "command_id": "07aab032-53c2-45ef-a5a3-6aa58e9d9fae", + "status": "accepted | execution_started | completed | failed", + "error_code": null, + "error_message": null +} +``` + +**Status lifecycle:** + +| Status | When | Notes | +|---|---|---| +| `accepted` | Command received and validated | Immediate | +| `execution_started` | Helper invoked | Immediate after accepted | +| `completed` | Execution confirmed | For `reboot_host`: arrives after reconnect (10–90 s after `execution_started`) | +| `failed` | Helper returned error | `error_code` and `error_message` will be set | + +**Server must:** +- Track `command_id` through the full lifecycle and update status in DB/UI. +- Surface `failed` + `error_code` to the operator UI. +- Expect `reboot_host` `completed` to arrive after a reconnect delay — do not treat the gap as a timeout. +- Use `expires_at` from the original command to determine when to abandon waiting. + +--- + +## 3. Health Dashboard — Broker Connection Fields (Gap 2) + +Every `infoscreen/{uuid}/health` payload now includes a `broker_connection` block: + +```json +{ + "timestamp": "2026-04-05T08:00:00.000000+00:00", + "expected_state": { "event_id": 42 }, + "actual_state": { + "process": "display_manager", + "pid": 1234, + "status": "running" + }, + "broker_connection": { + "broker_reachable": true, + "reconnect_count": 2, + "last_disconnect_at": "2026-04-04T10:30:00Z" + } +} +``` + +**Server must:** +- Display `reconnect_count` and `last_disconnect_at` per device in the health dashboard. +- Implement alerting heuristic: + - **All** clients go silent simultaneously → likely broker outage, not device crash. + - **Single** client goes silent → device crash, network failure, or process hang. 
+ +--- + +## 4. Service-Failed MQTT Notification (Gap 3) + +When systemd gives up restarting a service after repeated crashes (`StartLimitBurst` exceeded), the client automatically publishes a **retained** message: + +**Topic:** `infoscreen/{uuid}/service_failed` + +**Payload:** +```json +{ + "event": "service_failed", + "unit": "infoscreen-simclient.service", + "client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "failed_at": "2026-04-05T08:00:00Z" +} +``` + +**Server must:** +- Subscribe to `infoscreen/+/service_failed` on startup (retained — message survives broker restart). +- Alert the operator immediately when this topic receives a payload. +- **Clear the retained message** once the device is acknowledged or recovered: + ``` + mosquitto_pub -t "infoscreen/{uuid}/service_failed" -n --retain + ``` + +--- + +## 5. No Server Action Required + +These items are fully implemented client-side and require no server changes: + +- systemd watchdog (`WatchdogSec=60`) — hangs detected and process restarted automatically. +- Command deduplication — `command_id` deduplicated with 24-hour TTL. +- Ack retry backoff — client retries ack publish on broker disconnect until `expires_at`. +- Mock helper / test mode (`COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE`) — development only. 
diff --git a/listener/listener.py b/listener/listener.py index d730e6d..ef6b8fb 100644 --- a/listener/listener.py +++ b/listener/listener.py @@ -4,11 +4,12 @@ import logging import datetime import base64 import re +import ssl import requests import paho.mqtt.client as mqtt from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker -from models.models import Client, ClientLog, LogLevel, ProcessStatus, ScreenHealthStatus +from models.models import Client, ClientLog, ClientCommand, LogLevel, ProcessStatus, ScreenHealthStatus logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s') # Load .env only when not already configured by Docker (API_BASE_URL not set by compose means we're outside a container) @@ -32,6 +33,16 @@ Session = sessionmaker(bind=engine) # API configuration API_BASE_URL = os.getenv("API_BASE_URL", "http://server:8000") +MQTT_BROKER_HOST = os.getenv("MQTT_BROKER_HOST", "mqtt") +MQTT_BROKER_PORT = int(os.getenv("MQTT_BROKER_PORT", os.getenv("MQTT_PORT", "1883"))) +MQTT_USERNAME = os.getenv("MQTT_USER") or os.getenv("MQTT_USERNAME") +MQTT_PASSWORD = os.getenv("MQTT_PASSWORD") +MQTT_TLS_ENABLED = os.getenv("MQTT_TLS_ENABLED", "false").strip().lower() in ("1", "true", "yes", "on") +MQTT_TLS_CA_CERT = os.getenv("MQTT_TLS_CA_CERT") +MQTT_TLS_CERTFILE = os.getenv("MQTT_TLS_CERTFILE") +MQTT_TLS_KEYFILE = os.getenv("MQTT_TLS_KEYFILE") +MQTT_TLS_INSECURE = os.getenv("MQTT_TLS_INSECURE", "false").strip().lower() in ("1", "true", "yes", "on") + # Dashboard payload migration observability DASHBOARD_METRICS_LOG_EVERY = int(os.getenv("DASHBOARD_METRICS_LOG_EVERY", "5")) DASHBOARD_PARSE_METRICS = { @@ -376,8 +387,11 @@ def on_connect(client, userdata, flags, reasonCode, properties): client.subscribe("infoscreen/+/logs/warn") client.subscribe("infoscreen/+/logs/info") client.subscribe("infoscreen/+/health") + client.subscribe("infoscreen/+/commands/ack") + client.subscribe("infoscreen/+/command/ack") + 
client.subscribe("infoscreen/+/service_failed") - logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, and health") + logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, health, and service_failed") except Exception as e: logging.error(f"Subscribe failed on connect: {e}") @@ -387,6 +401,72 @@ def on_message(client, userdata, msg): logging.debug(f"Empfangene Nachricht auf Topic: {topic}") try: + # Command acknowledgement handling + if topic.startswith("infoscreen/") and (topic.endswith("/commands/ack") or topic.endswith("/command/ack")): + uuid = topic.split("/")[1] + try: + payload = json.loads(msg.payload.decode()) + except (json.JSONDecodeError, UnicodeDecodeError): + logging.error(f"Ungueltiges Command-ACK Payload von {uuid}") + return + + command_id = payload.get("command_id") + ack_status = str(payload.get("status", "")).strip().lower() + error_code = payload.get("error_code") + error_message = payload.get("error_message") + + if not command_id: + logging.warning(f"Command-ACK ohne command_id von {uuid}") + return + + status_map = { + "accepted": "ack_received", + "execution_started": "execution_started", + "completed": "completed", + "failed": "failed", + } + mapped_status = status_map.get(ack_status) + if not mapped_status: + logging.warning(f"Unbekannter Command-ACK Status '{ack_status}' von {uuid}") + return + + db_session = Session() + try: + command_obj = db_session.query(ClientCommand).filter_by(command_id=command_id).first() + if not command_obj: + logging.warning(f"Command-ACK fuer unbekanntes command_id={command_id} von {uuid}") + return + + # Ignore stale/duplicate regressions. 
+ terminal_states = {"completed", "failed", "expired", "canceled", "blocked_safety"} + if command_obj.status in terminal_states: + logging.info( + f"Command-ACK ignoriert (bereits terminal): command_id={command_id}, status={command_obj.status}" + ) + return + + now_utc = datetime.datetime.now(datetime.UTC) + command_obj.status = mapped_status + if mapped_status == "ack_received": + command_obj.acked_at = now_utc + elif mapped_status == "execution_started": + command_obj.execution_started_at = now_utc + elif mapped_status == "completed": + command_obj.completed_at = now_utc + elif mapped_status == "failed": + command_obj.failed_at = now_utc + command_obj.error_code = str(error_code) if error_code is not None else command_obj.error_code + command_obj.error_message = str(error_message) if error_message is not None else command_obj.error_message + + db_session.commit() + logging.info(f"Command-ACK verarbeitet: command_id={command_id}, status={mapped_status}, uuid={uuid}") + except Exception as e: + db_session.rollback() + logging.error(f"Fehler bei Command-ACK Verarbeitung ({command_id}): {e}") + finally: + db_session.close() + return + # Dashboard-Handling (nested screenshot payload) if topic.startswith("infoscreen/") and topic.endswith("/dashboard"): uuid = topic.split("/")[1] @@ -506,6 +586,43 @@ def on_message(client, userdata, msg): logging.error(f"Could not parse log payload from {uuid}: {e}") return + # Service-failed handling (systemd gave up restarting — retained message) + if topic.startswith("infoscreen/") and topic.endswith("/service_failed"): + uuid = topic.split("/")[1] + # Empty payload = retained message cleared; ignore it. 
+ if not msg.payload: + logging.info(f"service_failed retained message cleared for {uuid}") + return + try: + payload_data = json.loads(msg.payload.decode()) + failed_at_str = payload_data.get("failed_at") + unit = payload_data.get("unit", "") + try: + failed_at = datetime.datetime.fromisoformat(failed_at_str.replace("Z", "+00:00")) if failed_at_str else datetime.datetime.now(datetime.UTC) + if failed_at.tzinfo is None: + failed_at = failed_at.replace(tzinfo=datetime.UTC) + except (ValueError, AttributeError): + failed_at = datetime.datetime.now(datetime.UTC) + + session = Session() + try: + client_obj = session.query(Client).filter_by(uuid=uuid).first() + if client_obj: + client_obj.service_failed_at = failed_at + client_obj.service_failed_unit = unit[:128] if unit else None + session.commit() + logging.warning(f"event=service_failed uuid={uuid} unit={unit} failed_at={failed_at.isoformat()}") + else: + logging.warning(f"service_failed received for unknown client uuid={uuid}") + except Exception as e: + session.rollback() + logging.error(f"Error persisting service_failed for {uuid}: {e}") + finally: + session.close() + except (json.JSONDecodeError, UnicodeDecodeError) as e: + logging.error(f"Could not parse service_failed payload from {uuid}: {e}") + return + # Health-Handling if topic.startswith("infoscreen/") and topic.endswith("/health"): uuid = topic.split("/")[1] @@ -531,6 +648,26 @@ def on_message(client, userdata, msg): screen_health_status=screen_health_status, last_screenshot_analyzed=parse_timestamp((payload_data.get('health_metrics') or {}).get('last_frame_update')), ) + + # Update broker connection health fields + broker_conn = payload_data.get('broker_connection') + if isinstance(broker_conn, dict): + reconnect_count = broker_conn.get('reconnect_count') + last_disconnect_str = broker_conn.get('last_disconnect_at') + if reconnect_count is not None: + try: + client_obj.mqtt_reconnect_count = int(reconnect_count) + except (ValueError, TypeError): + pass + 
if last_disconnect_str: + try: + last_disconnect = datetime.datetime.fromisoformat(last_disconnect_str.replace('Z', '+00:00')) + if last_disconnect.tzinfo is None: + last_disconnect = last_disconnect.replace(tzinfo=datetime.UTC) + client_obj.mqtt_last_disconnect_at = last_disconnect + except (ValueError, AttributeError): + pass + session.commit() logging.debug(f"Health update from {uuid}: {actual.get('process')} ({actual.get('status')})") session.close() @@ -589,9 +726,29 @@ def main(): mqtt_client.on_connect = on_connect # Set an exponential reconnect delay to survive broker restarts mqtt_client.reconnect_delay_set(min_delay=1, max_delay=60) - mqtt_client.connect("mqtt", 1883) - logging.info("Listener gestartet; warte auf MQTT-Verbindung und Nachrichten") + if MQTT_USERNAME and MQTT_PASSWORD: + mqtt_client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD) + + if MQTT_TLS_ENABLED: + mqtt_client.tls_set( + ca_certs=MQTT_TLS_CA_CERT, + certfile=MQTT_TLS_CERTFILE, + keyfile=MQTT_TLS_KEYFILE, + cert_reqs=ssl.CERT_REQUIRED, + ) + if MQTT_TLS_INSECURE: + mqtt_client.tls_insecure_set(True) + + mqtt_client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT) + + logging.info( + "Listener gestartet; warte auf MQTT-Verbindung und Nachrichten (host=%s port=%s tls=%s auth=%s)", + MQTT_BROKER_HOST, + MQTT_BROKER_PORT, + MQTT_TLS_ENABLED, + bool(MQTT_USERNAME and MQTT_PASSWORD), + ) mqtt_client.loop_forever() diff --git a/models/models.py b/models/models.py index 38e5e40..2a9ebeb 100644 --- a/models/models.py +++ b/models/models.py @@ -147,6 +147,14 @@ class Client(Base): screen_health_status = Column(Enum(ScreenHealthStatus), nullable=True, server_default='UNKNOWN') last_screenshot_hash = Column(String(32), nullable=True) + # Systemd service-failed tracking + service_failed_at = Column(TIMESTAMP(timezone=True), nullable=True) + service_failed_unit = Column(String(128), nullable=True) + + # MQTT broker connection health + mqtt_reconnect_count = Column(Integer, nullable=True) + 
mqtt_last_disconnect_at = Column(TIMESTAMP(timezone=True), nullable=True) + class ClientLog(Base): __tablename__ = 'client_logs' @@ -164,6 +172,33 @@ class ClientLog(Base): ) +class ClientCommand(Base): + __tablename__ = 'client_commands' + + id = Column(Integer, primary_key=True, autoincrement=True) + command_id = Column(String(36), nullable=False, unique=True, index=True) + client_uuid = Column(String(36), ForeignKey('clients.uuid', ondelete='CASCADE'), nullable=False, index=True) + action = Column(String(32), nullable=False, index=True) + status = Column(String(40), nullable=False, index=True) + reason = Column(Text, nullable=True) + requested_by = Column(Integer, ForeignKey('users.id', ondelete='SET NULL'), nullable=True, index=True) + issued_at = Column(TIMESTAMP(timezone=True), nullable=False) + expires_at = Column(TIMESTAMP(timezone=True), nullable=False) + published_at = Column(TIMESTAMP(timezone=True), nullable=True) + acked_at = Column(TIMESTAMP(timezone=True), nullable=True) + execution_started_at = Column(TIMESTAMP(timezone=True), nullable=True) + completed_at = Column(TIMESTAMP(timezone=True), nullable=True) + failed_at = Column(TIMESTAMP(timezone=True), nullable=True) + error_code = Column(String(64), nullable=True) + error_message = Column(Text, nullable=True) + created_at = Column(TIMESTAMP(timezone=True), server_default=func.current_timestamp(), nullable=False) + updated_at = Column(TIMESTAMP(timezone=True), server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=False) + + __table_args__ = ( + Index('ix_client_commands_client_status_created', 'client_uuid', 'status', 'created_at'), + ) + + class EventType(enum.Enum): presentation = "presentation" website = "website" diff --git a/scheduler/db_utils.py b/scheduler/db_utils.py index 35f0799..853a57d 100644 --- a/scheduler/db_utils.py +++ b/scheduler/db_utils.py @@ -1,13 +1,14 @@ # scheduler/db_utils.py from dotenv import load_dotenv import os -from datetime import 
datetime +from datetime import datetime, timedelta, timezone import hashlib import json import logging from sqlalchemy.orm import sessionmaker, joinedload from sqlalchemy import create_engine, or_, and_, text -from models.models import Event, EventMedia, EventException, SystemSetting +import uuid as _uuid_mod +from models.models import Event, EventMedia, EventException, SystemSetting, Client, ClientCommand, ProcessStatus from dateutil.rrule import rrulestr from urllib.request import Request, urlopen from datetime import timezone @@ -454,3 +455,167 @@ def format_event_with_media(event): # Add other event types (message, etc.) here as needed... return event_dict + + +# --------------------------------------------------------------------------- +# Crash detection / auto-recovery helpers +# --------------------------------------------------------------------------- + +_CRASH_RECOVERY_SCHEMA_VERSION = "1.0" +_CRASH_COMMAND_TOPIC = "infoscreen/{uuid}/commands" +_CRASH_COMMAND_COMPAT_TOPIC = "infoscreen/{uuid}/command" +_CRASH_RECOVERY_EXPIRY_SECONDS = int(os.getenv("CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS", "240")) +_CRASH_RECOVERY_LOCKOUT_MINUTES = int(os.getenv("CRASH_RECOVERY_LOCKOUT_MINUTES", "15")) + + +def get_crash_recovery_candidates(heartbeat_grace_seconds: int) -> list: + """ + Returns a list of dicts for active clients that are crashed (process_status=crashed) + or heartbeat-stale, and don't already have a recent recovery command in the lockout window. 
+ """ + session = Session() + try: + now = datetime.now(timezone.utc) + stale_cutoff = now - timedelta(seconds=heartbeat_grace_seconds) + lockout_cutoff = now - timedelta(minutes=_CRASH_RECOVERY_LOCKOUT_MINUTES) + + candidates = ( + session.query(Client) + .filter(Client.is_active == True) + .filter( + or_( + Client.process_status == ProcessStatus.crashed, + Client.last_alive < stale_cutoff, + ) + ) + .all() + ) + + result = [] + for c in candidates: + recent = ( + session.query(ClientCommand) + .filter(ClientCommand.client_uuid == c.uuid) + .filter(ClientCommand.created_at >= lockout_cutoff) + .filter(ClientCommand.action.in_(["reboot_host", "restart_app"])) + .first() + ) + if recent: + continue + crash_reason = ( + "process_crashed" + if c.process_status == ProcessStatus.crashed + else "heartbeat_stale" + ) + result.append({ + "uuid": c.uuid, + "reason": crash_reason, + "process_status": c.process_status.value if c.process_status else None, + "last_alive": c.last_alive, + }) + return result + finally: + session.close() + + +def issue_crash_recovery_command(client_uuid: str, reason: str) -> tuple: + """ + Writes a ClientCommand (reboot_host) for crash recovery to the DB. + Returns (command_id, payload_dict) for the caller to publish over MQTT. + Also returns the MQTT topic strings. 
+ """ + session = Session() + try: + now = datetime.now(timezone.utc) + expires_at = now + timedelta(seconds=_CRASH_RECOVERY_EXPIRY_SECONDS) + command_id = str(_uuid_mod.uuid4()) + + command = ClientCommand( + command_id=command_id, + client_uuid=client_uuid, + action="reboot_host", + status="queued", + reason=reason, + requested_by=None, + issued_at=now, + expires_at=expires_at, + ) + session.add(command) + session.commit() + command.status = "publish_in_progress" + session.commit() + + payload = { + "schema_version": _CRASH_RECOVERY_SCHEMA_VERSION, + "command_id": command_id, + "client_uuid": client_uuid, + "action": "reboot_host", + "issued_at": now.isoformat().replace("+00:00", "Z"), + "expires_at": expires_at.isoformat().replace("+00:00", "Z"), + "requested_by": None, + "reason": reason, + } + topic = _CRASH_COMMAND_TOPIC.format(uuid=client_uuid) + compat_topic = _CRASH_COMMAND_COMPAT_TOPIC.format(uuid=client_uuid) + return command_id, payload, topic, compat_topic + except Exception: + session.rollback() + raise + finally: + session.close() + + +def finalize_crash_recovery_command(command_id: str, published: bool, error: str = None) -> None: + """Updates command status after MQTT publish attempt.""" + session = Session() + try: + cmd = session.query(ClientCommand).filter_by(command_id=command_id).first() + if not cmd: + return + now = datetime.now(timezone.utc) + if published: + cmd.status = "published" + cmd.published_at = now + else: + cmd.status = "failed" + cmd.failed_at = now + cmd.error_code = "mqtt_publish_failed" + cmd.error_message = error or "Unknown publish error" + session.commit() + finally: + session.close() + + +_TERMINAL_COMMAND_STATUSES = {"completed", "failed", "expired", "canceled", "blocked_safety"} + + +def sweep_expired_commands() -> int: + """Marks non-terminal commands whose expires_at has passed as expired. + + Returns the number of commands updated. 
+ """ + session = Session() + try: + now = datetime.now(timezone.utc) + commands = ( + session.query(ClientCommand) + .filter( + ClientCommand.expires_at < now, + ClientCommand.status.notin_(_TERMINAL_COMMAND_STATUSES), + ) + .all() + ) + if not commands: + return 0 + for cmd in commands: + cmd.status = "expired" + cmd.failed_at = now + cmd.error_code = "expired" + cmd.error_message = "Command expired before terminal state was reached." + session.commit() + return len(commands) + except Exception: + session.rollback() + raise + finally: + session.close() diff --git a/scheduler/scheduler.py b/scheduler/scheduler.py index 0c390c4..cd762cd 100644 --- a/scheduler/scheduler.py +++ b/scheduler/scheduler.py @@ -8,12 +8,28 @@ from .db_utils import ( compute_group_power_intent_basis, build_group_power_intent_body, compute_group_power_intent_fingerprint, + get_crash_recovery_candidates, + issue_crash_recovery_command, + finalize_crash_recovery_command, + sweep_expired_commands, ) import paho.mqtt.client as mqtt import json import datetime import time import uuid +import ssl + + +MQTT_BROKER_HOST = os.getenv("MQTT_BROKER_HOST", os.getenv("MQTT_BROKER_URL", "mqtt")) +MQTT_BROKER_PORT = int(os.getenv("MQTT_BROKER_PORT", os.getenv("MQTT_PORT", "1883"))) +MQTT_USERNAME = os.getenv("MQTT_USER") or os.getenv("MQTT_USERNAME") +MQTT_PASSWORD = os.getenv("MQTT_PASSWORD") +MQTT_TLS_ENABLED = os.getenv("MQTT_TLS_ENABLED", "false").strip().lower() in ("1", "true", "yes", "on") +MQTT_TLS_CA_CERT = os.getenv("MQTT_TLS_CA_CERT") +MQTT_TLS_CERTFILE = os.getenv("MQTT_TLS_CERTFILE") +MQTT_TLS_KEYFILE = os.getenv("MQTT_TLS_KEYFILE") +MQTT_TLS_INSECURE = os.getenv("MQTT_TLS_INSECURE", "false").strip().lower() in ("1", "true", "yes", "on") def _to_utc_z(dt: datetime.datetime) -> str: @@ -224,6 +240,19 @@ def main(): client = mqtt.Client(callback_api_version=mqtt.CallbackAPIVersion.VERSION2) client.reconnect_delay_set(min_delay=1, max_delay=30) + if MQTT_USERNAME and MQTT_PASSWORD: + 
client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD) + + if MQTT_TLS_ENABLED: + client.tls_set( + ca_certs=MQTT_TLS_CA_CERT, + certfile=MQTT_TLS_CERTFILE, + keyfile=MQTT_TLS_KEYFILE, + cert_reqs=ssl.CERT_REQUIRED, + ) + if MQTT_TLS_INSECURE: + client.tls_insecure_set(True) + POLL_INTERVAL = int(os.getenv("POLL_INTERVAL_SECONDS", "30")) # 0 = aus; z.B. 600 für alle 10 Min # initial value from DB or fallback to env @@ -238,16 +267,21 @@ def main(): POWER_INTENT_HEARTBEAT_ENABLED = _env_bool("POWER_INTENT_HEARTBEAT_ENABLED", True) POWER_INTENT_EXPIRY_MULTIPLIER = int(os.getenv("POWER_INTENT_EXPIRY_MULTIPLIER", "3")) POWER_INTENT_MIN_EXPIRY_SECONDS = int(os.getenv("POWER_INTENT_MIN_EXPIRY_SECONDS", "90")) + CRASH_RECOVERY_ENABLED = _env_bool("CRASH_RECOVERY_ENABLED", False) + CRASH_RECOVERY_GRACE_SECONDS = int(os.getenv("CRASH_RECOVERY_GRACE_SECONDS", "180")) logging.info( "Scheduler config: poll_interval=%ss refresh_seconds=%s power_intent_enabled=%s " - "power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss", + "power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss " + "crash_recovery_enabled=%s crash_recovery_grace=%ss", POLL_INTERVAL, REFRESH_SECONDS, POWER_INTENT_PUBLISH_ENABLED, POWER_INTENT_HEARTBEAT_ENABLED, POWER_INTENT_EXPIRY_MULTIPLIER, POWER_INTENT_MIN_EXPIRY_SECONDS, + CRASH_RECOVERY_ENABLED, + CRASH_RECOVERY_GRACE_SECONDS, ) # Konfigurierbares Zeitfenster in Tagen (Standard: 7) WINDOW_DAYS = int(os.getenv("EVENTS_WINDOW_DAYS", "7")) @@ -275,8 +309,15 @@ def main(): client.on_connect = on_connect - client.connect("mqtt", 1883) + client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT) client.loop_start() + logging.info( + "MQTT connection configured host=%s port=%s tls=%s auth=%s", + MQTT_BROKER_HOST, + MQTT_BROKER_PORT, + MQTT_TLS_ENABLED, + bool(MQTT_USERNAME and MQTT_PASSWORD), + ) while True: now = datetime.datetime.now(datetime.timezone.utc) @@ -390,6 +431,51 @@ def main(): 
power_intent_metrics["retained_republish_total"], ) + if CRASH_RECOVERY_ENABLED: + try: + candidates = get_crash_recovery_candidates(CRASH_RECOVERY_GRACE_SECONDS) + if candidates: + logging.info("event=crash_recovery_scan candidates=%s", len(candidates)) + for candidate in candidates: + cuuid = candidate["uuid"] + reason = candidate["reason"] + try: + command_id, payload, topic, compat_topic = issue_crash_recovery_command( + client_uuid=cuuid, + reason=reason, + ) + result = client.publish(topic, json.dumps(payload), qos=1, retain=False) + result.wait_for_publish(timeout=5.0) + compat_result = client.publish(compat_topic, json.dumps(payload), qos=1, retain=False) + compat_result.wait_for_publish(timeout=5.0) + success = result.rc == mqtt.MQTT_ERR_SUCCESS + error = None if success else mqtt.error_string(result.rc) + finalize_crash_recovery_command(command_id, published=success, error=error) + if success: + logging.info( + "event=crash_recovery_command_issued client_uuid=%s reason=%s command_id=%s", + cuuid, reason, command_id, + ) + else: + logging.error( + "event=crash_recovery_publish_failed client_uuid=%s reason=%s command_id=%s error=%s", + cuuid, reason, command_id, error, + ) + except Exception as cmd_exc: + logging.error( + "event=crash_recovery_command_error client_uuid=%s reason=%s error=%s", + cuuid, reason, cmd_exc, + ) + except Exception as scan_exc: + logging.error("event=crash_recovery_scan_error error=%s", scan_exc) + + try: + expired_count = sweep_expired_commands() + if expired_count: + logging.info("event=command_expiry_sweep expired=%s", expired_count) + except Exception as sweep_exc: + logging.error("event=command_expiry_sweep_error error=%s", sweep_exc) + time.sleep(POLL_INTERVAL) diff --git a/server/alembic/versions/aa12bb34cc56_add_client_commands_table.py b/server/alembic/versions/aa12bb34cc56_add_client_commands_table.py new file mode 100644 index 0000000..957f28d --- /dev/null +++ 
b/server/alembic/versions/aa12bb34cc56_add_client_commands_table.py @@ -0,0 +1,63 @@ +"""add client commands table + +Revision ID: aa12bb34cc56 +Revises: f3c4d5e6a7b8 +Create Date: 2026-04-03 12:40:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'aa12bb34cc56' +down_revision: Union[str, None] = 'f3c4d5e6a7b8' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'client_commands', + sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('command_id', sa.String(length=36), nullable=False), + sa.Column('client_uuid', sa.String(length=36), nullable=False), + sa.Column('action', sa.String(length=32), nullable=False), + sa.Column('status', sa.String(length=40), nullable=False), + sa.Column('reason', sa.Text(), nullable=True), + sa.Column('requested_by', sa.Integer(), nullable=True), + sa.Column('issued_at', sa.TIMESTAMP(timezone=True), nullable=False), + sa.Column('expires_at', sa.TIMESTAMP(timezone=True), nullable=False), + sa.Column('published_at', sa.TIMESTAMP(timezone=True), nullable=True), + sa.Column('acked_at', sa.TIMESTAMP(timezone=True), nullable=True), + sa.Column('execution_started_at', sa.TIMESTAMP(timezone=True), nullable=True), + sa.Column('completed_at', sa.TIMESTAMP(timezone=True), nullable=True), + sa.Column('failed_at', sa.TIMESTAMP(timezone=True), nullable=True), + sa.Column('error_code', sa.String(length=64), nullable=True), + sa.Column('error_message', sa.Text(), nullable=True), + sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.current_timestamp(), nullable=False), + sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.current_timestamp(), nullable=False), + sa.ForeignKeyConstraint(['client_uuid'], ['clients.uuid'], ondelete='CASCADE'), + 
sa.ForeignKeyConstraint(['requested_by'], ['users.id'], ondelete='SET NULL'), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('command_id') + ) + + op.create_index(op.f('ix_client_commands_action'), 'client_commands', ['action'], unique=False) + op.create_index(op.f('ix_client_commands_client_uuid'), 'client_commands', ['client_uuid'], unique=False) + op.create_index(op.f('ix_client_commands_command_id'), 'client_commands', ['command_id'], unique=False) + op.create_index(op.f('ix_client_commands_requested_by'), 'client_commands', ['requested_by'], unique=False) + op.create_index(op.f('ix_client_commands_status'), 'client_commands', ['status'], unique=False) + op.create_index('ix_client_commands_client_status_created', 'client_commands', ['client_uuid', 'status', 'created_at'], unique=False) + + +def downgrade() -> None: + op.drop_index('ix_client_commands_client_status_created', table_name='client_commands') + op.drop_index(op.f('ix_client_commands_status'), table_name='client_commands') + op.drop_index(op.f('ix_client_commands_requested_by'), table_name='client_commands') + op.drop_index(op.f('ix_client_commands_command_id'), table_name='client_commands') + op.drop_index(op.f('ix_client_commands_client_uuid'), table_name='client_commands') + op.drop_index(op.f('ix_client_commands_action'), table_name='client_commands') + op.drop_table('client_commands') diff --git a/server/alembic/versions/b1c2d3e4f5a6_add_service_failed_and_mqtt_broker_fields.py b/server/alembic/versions/b1c2d3e4f5a6_add_service_failed_and_mqtt_broker_fields.py new file mode 100644 index 0000000..ff60540 --- /dev/null +++ b/server/alembic/versions/b1c2d3e4f5a6_add_service_failed_and_mqtt_broker_fields.py @@ -0,0 +1,43 @@ +"""add service_failed and mqtt broker connection fields to clients + +Revision ID: b1c2d3e4f5a6 +Revises: aa12bb34cc56 +Create Date: 2026-04-05 10:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision 
identifiers, used by Alembic. +revision: str = 'b1c2d3e4f5a6' +down_revision: Union[str, None] = 'aa12bb34cc56' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + bind = op.get_bind() + inspector = sa.inspect(bind) + existing = {c['name'] for c in inspector.get_columns('clients')} + + # Systemd service-failed tracking + if 'service_failed_at' not in existing: + op.add_column('clients', sa.Column('service_failed_at', sa.TIMESTAMP(timezone=True), nullable=True)) + if 'service_failed_unit' not in existing: + op.add_column('clients', sa.Column('service_failed_unit', sa.String(128), nullable=True)) + + # MQTT broker connection health + if 'mqtt_reconnect_count' not in existing: + op.add_column('clients', sa.Column('mqtt_reconnect_count', sa.Integer(), nullable=True)) + if 'mqtt_last_disconnect_at' not in existing: + op.add_column('clients', sa.Column('mqtt_last_disconnect_at', sa.TIMESTAMP(timezone=True), nullable=True)) + + +def downgrade() -> None: + op.drop_column('clients', 'mqtt_last_disconnect_at') + op.drop_column('clients', 'mqtt_reconnect_count') + op.drop_column('clients', 'service_failed_unit') + op.drop_column('clients', 'service_failed_at') diff --git a/server/init_defaults.py b/server/init_defaults.py index 7be8eb8..12e8105 100644 --- a/server/init_defaults.py +++ b/server/init_defaults.py @@ -1,4 +1,4 @@ -from sqlalchemy import create_engine, text +from sqlalchemy import create_engine, text import os from dotenv import load_dotenv import bcrypt diff --git a/server/routes/client_logs.py b/server/routes/client_logs.py index f208728..53438ba 100644 --- a/server/routes/client_logs.py +++ b/server/routes/client_logs.py @@ -421,6 +421,8 @@ def get_monitoring_overview(): }, "latest_log": _serialize_log_entry(latest_log), "latest_error": _serialize_log_entry(latest_error), + "mqtt_reconnect_count": client.mqtt_reconnect_count, + "mqtt_last_disconnect_at": 
client.mqtt_last_disconnect_at.isoformat() if client.mqtt_last_disconnect_at else None, }) summary_counts["total_clients"] += 1 diff --git a/server/routes/clients.py b/server/routes/clients.py index 653cead..a39e464 100644 --- a/server/routes/clients.py +++ b/server/routes/clients.py @@ -1,7 +1,8 @@ from server.database import Session -from models.models import Client, ClientGroup -from flask import Blueprint, request, jsonify +from models.models import Client, ClientGroup, ClientCommand, ProcessStatus +from flask import Blueprint, request, jsonify, session as flask_session from server.permissions import admin_or_higher +from server.routes.groups import get_grace_period, is_client_alive from server.mqtt_helper import publish_client_group, delete_client_group_message, publish_multiple_client_groups import sys import os @@ -9,13 +10,196 @@ import glob import base64 import hashlib import json -from datetime import datetime, timezone +import uuid as uuid_lib +from datetime import datetime, timezone, timedelta sys.path.append('/workspace') clients_bp = Blueprint("clients", __name__, url_prefix="/api/clients") VALID_SCREENSHOT_TYPES = {"periodic", "event_start", "event_stop"} +COMMAND_SCHEMA_VERSION = "1.0" +COMMAND_TOPIC_TEMPLATE = "infoscreen/{uuid}/commands" +COMMAND_TOPIC_COMPAT_TEMPLATE = "infoscreen/{uuid}/command" +LEGACY_RESTART_TOPIC_TEMPLATE = "clients/{uuid}/restart" +COMMAND_EXPIRY_SECONDS = 240 +REBOOT_LOCKOUT_WINDOW_MINUTES = 15 +REBOOT_LOCKOUT_THRESHOLD = 3 +API_ACTION_TO_COMMAND_ACTION = { + "restart": "reboot_host", + "shutdown": "shutdown_host", + "restart_app": "restart_app", +} +ALLOWED_COMMAND_ACTIONS = set(API_ACTION_TO_COMMAND_ACTION.keys()) + + +def _iso_utc_z(ts: datetime) -> str: + return ts.astimezone(timezone.utc).isoformat().replace("+00:00", "Z") + + +def _command_to_dict(command: ClientCommand) -> dict: + return { + "commandId": command.command_id, + "clientUuid": command.client_uuid, + "action": command.action, + "status": command.status, 
+ "reason": command.reason, + "requestedBy": command.requested_by, + "issuedAt": command.issued_at.isoformat() if command.issued_at else None, + "expiresAt": command.expires_at.isoformat() if command.expires_at else None, + "publishedAt": command.published_at.isoformat() if command.published_at else None, + "ackedAt": command.acked_at.isoformat() if command.acked_at else None, + "executionStartedAt": command.execution_started_at.isoformat() if command.execution_started_at else None, + "completedAt": command.completed_at.isoformat() if command.completed_at else None, + "failedAt": command.failed_at.isoformat() if command.failed_at else None, + "errorCode": command.error_code, + "errorMessage": command.error_message, + "createdAt": command.created_at.isoformat() if command.created_at else None, + "updatedAt": command.updated_at.isoformat() if command.updated_at else None, + } + + +def _publish_client_command(client_uuid: str, action: str, payload: dict) -> None: + import paho.mqtt.client as mqtt + + broker_host = os.getenv("MQTT_BROKER_HOST", "mqtt") + broker_port = int(os.getenv("MQTT_BROKER_PORT", 1883)) + username = os.getenv("MQTT_USER") + password = os.getenv("MQTT_PASSWORD") + + mqtt_client = mqtt.Client() + if username and password: + mqtt_client.username_pw_set(username, password) + + mqtt_client.connect(broker_host, broker_port) + + # Primary topic for contract-based command handling. + command_topic = COMMAND_TOPIC_TEMPLATE.format(uuid=client_uuid) + result = mqtt_client.publish(command_topic, json.dumps(payload), qos=1, retain=False) + result.wait_for_publish(timeout=5.0) + + # Transitional compatibility for clients that still consume singular topic naming. + compat_topic = COMMAND_TOPIC_COMPAT_TEMPLATE.format(uuid=client_uuid) + compat_result = mqtt_client.publish(compat_topic, json.dumps(payload), qos=1, retain=False) + compat_result.wait_for_publish(timeout=5.0) + + # Transitional compatibility for existing restart-only clients. 
+ if action == "restart": + legacy_topic = LEGACY_RESTART_TOPIC_TEMPLATE.format(uuid=client_uuid) + legacy_payload = {"action": "restart"} + legacy_result = mqtt_client.publish(legacy_topic, json.dumps(legacy_payload), qos=1, retain=False) + legacy_result.wait_for_publish(timeout=5.0) + + mqtt_client.disconnect() + + +def _issue_client_command(client_uuid: str, action: str): + if action not in ALLOWED_COMMAND_ACTIONS: + return jsonify({"error": f"Unsupported action '{action}'"}), 400 + + command_action = API_ACTION_TO_COMMAND_ACTION[action] + + data = request.get_json(silent=True) or {} + reason = str(data.get("reason", "")).strip() or None + requested_by = flask_session.get("user_id") + + now_utc = datetime.now(timezone.utc) + expires_at = now_utc + timedelta(seconds=COMMAND_EXPIRY_SECONDS) + command_id = str(uuid_lib.uuid4()) + + db = Session() + try: + client = db.query(Client).filter_by(uuid=client_uuid).first() + if not client: + return jsonify({"error": "Client nicht gefunden"}), 404 + + # Safety lockout: avoid rapid repeated reboot loops per client. 
+ if command_action in ("reboot_host", "restart_app"): + window_start = now_utc - timedelta(minutes=REBOOT_LOCKOUT_WINDOW_MINUTES) + recent_reboots = ( + db.query(ClientCommand) + .filter(ClientCommand.client_uuid == client_uuid) + .filter(ClientCommand.action.in_(["reboot_host", "restart_app"])) + .filter(ClientCommand.created_at >= window_start) + .count() + ) + if recent_reboots >= REBOOT_LOCKOUT_THRESHOLD: + blocked = ClientCommand( + command_id=command_id, + client_uuid=client_uuid, + action=command_action, + status="blocked_safety", + reason=reason, + requested_by=requested_by, + issued_at=now_utc, + expires_at=expires_at, + failed_at=now_utc, + error_code="lockout_threshold", + error_message="Reboot lockout active for this client", + ) + db.add(blocked) + db.commit() + return jsonify({ + "success": False, + "message": "Neustart voruebergehend blockiert (Sicherheits-Lockout)", + "command": _command_to_dict(blocked), + }), 429 + + command = ClientCommand( + command_id=command_id, + client_uuid=client_uuid, + action=command_action, + status="queued", + reason=reason, + requested_by=requested_by, + issued_at=now_utc, + expires_at=expires_at, + ) + db.add(command) + db.commit() + + command.status = "publish_in_progress" + db.commit() + + payload = { + "schema_version": COMMAND_SCHEMA_VERSION, + "command_id": command.command_id, + "client_uuid": command.client_uuid, + "action": command.action, + "issued_at": _iso_utc_z(command.issued_at), + "expires_at": _iso_utc_z(command.expires_at), + "requested_by": command.requested_by, + "reason": command.reason, + } + + try: + _publish_client_command(client_uuid=client_uuid, action=action, payload=payload) + # ACK can arrive very quickly (including terminal failure) while publish is in-flight. + # Refresh to avoid regressing a newer listener-updated state back to "published". 
+ db.refresh(command) + command.published_at = command.published_at or datetime.now(timezone.utc) + if command.status in {"queued", "publish_in_progress"}: + command.status = "published" + db.commit() + return jsonify({ + "success": True, + "message": f"Command published for client {client_uuid}", + "command": _command_to_dict(command), + }), 202 + except Exception as publish_error: + command.status = "failed" + command.failed_at = datetime.now(timezone.utc) + command.error_code = "mqtt_publish_failed" + command.error_message = str(publish_error) + db.commit() + return jsonify({ + "success": False, + "error": f"Failed to publish command: {publish_error}", + "command": _command_to_dict(command), + }), 500 + finally: + db.close() + def _normalize_screenshot_type(raw_type): if raw_type is None: @@ -280,45 +464,148 @@ def get_clients_with_alive_status(): "ip": c.ip, "last_alive": c.last_alive.isoformat() if c.last_alive else None, "is_active": c.is_active, - "is_alive": bool(c.last_alive and c.is_active), + "is_alive": is_client_alive(c.last_alive, c.is_active), }) session.close() return jsonify(result) +@clients_bp.route("/crashed", methods=["GET"]) +@admin_or_higher +def get_crashed_clients(): + """Returns clients that are crashed (process_status=crashed) or heartbeat-stale.""" + session = Session() + try: + from datetime import timedelta + grace = get_grace_period() + from datetime import datetime, timezone + stale_cutoff = datetime.now(timezone.utc) - timedelta(seconds=grace) + clients = ( + session.query(Client) + .filter(Client.is_active == True) + .all() + ) + result = [] + for c in clients: + alive = is_client_alive(c.last_alive, c.is_active) + crashed = c.process_status == ProcessStatus.crashed + if not alive or crashed: + result.append({ + "uuid": c.uuid, + "description": c.description, + "hostname": c.hostname, + "ip": c.ip, + "group_id": c.group_id, + "is_alive": alive, + "process_status": c.process_status.value if c.process_status else None, + 
"screen_health_status": c.screen_health_status.value if c.screen_health_status else None, + "last_alive": c.last_alive.isoformat() if c.last_alive else None, + "crash_reason": "process_crashed" if crashed else "heartbeat_stale", + }) + return jsonify({ + "crashed_count": len(result), + "grace_period_seconds": grace, + "clients": result, + }) + finally: + session.close() + + +@clients_bp.route("/service_failed", methods=["GET"]) +@admin_or_higher +def get_service_failed_clients(): + """Returns clients that have a service_failed_at set (systemd gave up restarting).""" + session = Session() + try: + clients = ( + session.query(Client) + .filter(Client.service_failed_at.isnot(None)) + .order_by(Client.service_failed_at.desc()) + .all() + ) + result = [ + { + "uuid": c.uuid, + "description": c.description, + "hostname": c.hostname, + "ip": c.ip, + "group_id": c.group_id, + "service_failed_at": c.service_failed_at.isoformat() if c.service_failed_at else None, + "service_failed_unit": c.service_failed_unit, + "is_alive": is_client_alive(c.last_alive, c.is_active), + "last_alive": c.last_alive.isoformat() if c.last_alive else None, + } + for c in clients + ] + return jsonify({"service_failed_count": len(result), "clients": result}) + finally: + session.close() + + +@clients_bp.route("/<client_uuid>/clear_service_failed", methods=["POST"]) +@admin_or_higher +def clear_service_failed(client_uuid): + """Clears the service_failed flag for a client and deletes the retained MQTT message.""" + import paho.mqtt.client as mqtt_lib + + session = Session() + try: + c = session.query(Client).filter_by(uuid=client_uuid).first() + if not c: + return jsonify({"error": "Client nicht gefunden"}), 404 + if c.service_failed_at is None: + return jsonify({"success": True, "message": "Kein service_failed Flag gesetzt."}), 200 + + c.service_failed_at = None + c.service_failed_unit = None + session.commit() + finally: + session.close() + + # Clear the retained MQTT message (publish empty payload, 
retained=True) + try: + broker_host = os.getenv("MQTT_BROKER_HOST", "mqtt") + broker_port = int(os.getenv("MQTT_BROKER_PORT", 1883)) + username = os.getenv("MQTT_USER") + password = os.getenv("MQTT_PASSWORD") + mc = mqtt_lib.Client() + if username and password: + mc.username_pw_set(username, password) + mc.connect(broker_host, broker_port) + topic = f"infoscreen/{client_uuid}/service_failed" + mc.publish(topic, payload=None, qos=1, retain=True) + mc.disconnect() + except Exception as e: + # Log but don't fail — DB is already cleared + import logging + logging.warning(f"Could not clear retained service_failed MQTT message for {client_uuid}: {e}") + + return jsonify({"success": True, "message": "service_failed Flag gelöscht."}) + + @clients_bp.route("/<uuid>/restart", methods=["POST"]) @admin_or_higher def restart_client(uuid): - """ - Route to restart a specific client by UUID. - Sends an MQTT message to the broker to trigger the restart. - """ - import paho.mqtt.client as mqtt - import json + return _issue_client_command(client_uuid=uuid, action="restart") - # MQTT broker configuration - MQTT_BROKER = "mqtt" - MQTT_PORT = 1883 - MQTT_TOPIC = f"clients/{uuid}/restart" - # Connect to the database to check if the client exists - session = Session() - client = session.query(Client).filter_by(uuid=uuid).first() - if not client: - session.close() - return jsonify({"error": "Client nicht gefunden"}), 404 - session.close() +@clients_bp.route("/<uuid>/shutdown", methods=["POST"]) +@admin_or_higher +def shutdown_client(uuid): + return _issue_client_command(client_uuid=uuid, action="shutdown") - # Send MQTT message + +@clients_bp.route("/commands/<command_id>", methods=["GET"]) +@admin_or_higher +def get_client_command_status(command_id): + db = Session() try: - mqtt_client = mqtt.Client() - mqtt_client.connect(MQTT_BROKER, MQTT_PORT) - payload = {"action": "restart"} - mqtt_client.publish(MQTT_TOPIC, json.dumps(payload)) - mqtt_client.disconnect() - return jsonify({"success": True, "message": 
f"Restart signal sent to client {uuid}"}), 200 - except Exception as e: - return jsonify({"error": f"Failed to send MQTT message: {str(e)}"}), 500 + command = db.query(ClientCommand).filter_by(command_id=command_id).first() + if not command: + return jsonify({"error": "Command nicht gefunden"}), 404 + return jsonify(_command_to_dict(command)), 200 + finally: + db.close() @clients_bp.route("/<uuid>/screenshot", methods=["POST"])