feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep
- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat) - Add restart_app command action with same lifecycle + lockout as reboot_host - Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish) - Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands) - Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit - Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at - DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients - Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed - Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client - Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, quittieren action) - Frontend: MQTT reconnect count + last disconnect in client detail panel - MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false - Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle - Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated - Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
20
.env.example
20
.env.example
@@ -20,8 +20,18 @@ DB_HOST=db
|
|||||||
# MQTT
|
# MQTT
|
||||||
MQTT_BROKER_HOST=mqtt
|
MQTT_BROKER_HOST=mqtt
|
||||||
MQTT_BROKER_PORT=1883
|
MQTT_BROKER_PORT=1883
|
||||||
# MQTT_USER=your_mqtt_user
|
# Required for authenticated broker access
|
||||||
# MQTT_PASSWORD=your_mqtt_password
|
MQTT_USER=your_mqtt_user
|
||||||
|
MQTT_PASSWORD=replace_with_a_32plus_char_random_password
|
||||||
|
# Optional: dedicated canary client account
|
||||||
|
MQTT_CANARY_USER=your_canary_mqtt_user
|
||||||
|
MQTT_CANARY_PASSWORD=replace_with_a_different_32plus_char_random_password
|
||||||
|
# Optional TLS settings
|
||||||
|
MQTT_TLS_ENABLED=false
|
||||||
|
MQTT_TLS_CA_CERT=
|
||||||
|
MQTT_TLS_CERTFILE=
|
||||||
|
MQTT_TLS_KEYFILE=
|
||||||
|
MQTT_TLS_INSECURE=false
|
||||||
MQTT_KEEPALIVE=60
|
MQTT_KEEPALIVE=60
|
||||||
|
|
||||||
# Dashboard
|
# Dashboard
|
||||||
@@ -39,6 +49,12 @@ HEARTBEAT_GRACE_PERIOD_PROD=170
|
|||||||
# Optional: force periodic republish even without changes
|
# Optional: force periodic republish even without changes
|
||||||
# REFRESH_SECONDS=0
|
# REFRESH_SECONDS=0
|
||||||
|
|
||||||
|
# Crash recovery (scheduler auto-recovery)
|
||||||
|
# CRASH_RECOVERY_ENABLED=false
|
||||||
|
# CRASH_RECOVERY_GRACE_SECONDS=180
|
||||||
|
# CRASH_RECOVERY_LOCKOUT_MINUTES=15
|
||||||
|
# CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS=240
|
||||||
|
|
||||||
# Default superadmin bootstrap (server/init_defaults.py)
|
# Default superadmin bootstrap (server/init_defaults.py)
|
||||||
# REQUIRED: Must be set for superadmin creation
|
# REQUIRED: Must be set for superadmin creation
|
||||||
DEFAULT_SUPERADMIN_USERNAME=superadmin
|
DEFAULT_SUPERADMIN_USERNAME=superadmin
|
||||||
|
|||||||
12
.github/copilot-instructions.md
vendored
12
.github/copilot-instructions.md
vendored
@@ -13,15 +13,16 @@ It is not a changelog and not a full architecture handbook.
|
|||||||
- Keep changes minimal, match existing patterns, and update docs in the same commit when behavior changes.
|
- Keep changes minimal, match existing patterns, and update docs in the same commit when behavior changes.
|
||||||
|
|
||||||
## Fast file map
|
## Fast file map
|
||||||
- `scheduler/scheduler.py` - scheduler loop, MQTT event publishing, TV power intent publishing
|
- `scheduler/scheduler.py` - scheduler loop, MQTT event publishing, TV power intent publishing, crash auto-recovery, command expiry sweep
|
||||||
- `scheduler/db_utils.py` - event formatting and power-intent helper logic
|
- `scheduler/db_utils.py` - event formatting, power-intent helpers, crash recovery helpers, command expiry sweep
|
||||||
- `listener/listener.py` - discovery/heartbeat/log/screenshot MQTT consumption
|
- `listener/listener.py` - discovery/heartbeat/log/screenshot/service_failed MQTT consumption
|
||||||
- `server/init_academic_periods.py` - idempotent academic-period seeding + auto-activation for current date
|
- `server/init_academic_periods.py` - idempotent academic-period seeding + auto-activation for current date
|
||||||
- `server/initialize_database.py` - migration + bootstrap orchestration for local/manual setup
|
- `server/initialize_database.py` - migration + bootstrap orchestration for local/manual setup
|
||||||
- `server/routes/events.py` - event CRUD, recurrence handling, UTC normalization
|
- `server/routes/events.py` - event CRUD, recurrence handling, UTC normalization
|
||||||
- `server/routes/eventmedia.py` - file manager, media upload/stream endpoints
|
- `server/routes/eventmedia.py` - file manager, media upload/stream endpoints
|
||||||
- `server/routes/groups.py` - group lifecycle, alive status, order persistence
|
- `server/routes/groups.py` - group lifecycle, alive status, order persistence
|
||||||
- `server/routes/system_settings.py` - system settings CRUD and supplement-table endpoint
|
- `server/routes/system_settings.py` - system settings CRUD and supplement-table endpoint
|
||||||
|
- `server/routes/clients.py` - client metadata, restart/shutdown/restart_app command issuing, command status, crashed/service_failed alert endpoints
|
||||||
- `dashboard/src/settings.tsx` - settings UX and system-defaults integration
|
- `dashboard/src/settings.tsx` - settings UX and system-defaults integration
|
||||||
- `dashboard/src/components/CustomEventModal.tsx` - event creation/editing UX
|
- `dashboard/src/components/CustomEventModal.tsx` - event creation/editing UX
|
||||||
- `dashboard/src/monitoring.tsx` - superadmin monitoring page
|
- `dashboard/src/monitoring.tsx` - superadmin monitoring page
|
||||||
@@ -54,6 +55,9 @@ It is not a changelog and not a full architecture handbook.
|
|||||||
- Logs topic family: `infoscreen/{uuid}/logs/{error|warn|info}`
|
- Logs topic family: `infoscreen/{uuid}/logs/{error|warn|info}`
|
||||||
- Health topic: `infoscreen/{uuid}/health`
|
- Health topic: `infoscreen/{uuid}/health`
|
||||||
- Dashboard screenshot topic: `infoscreen/{uuid}/dashboard`
|
- Dashboard screenshot topic: `infoscreen/{uuid}/dashboard`
|
||||||
|
- Client command topic (QoS1, non-retained): `infoscreen/{uuid}/commands` (compat alias: `infoscreen/{uuid}/command`)
|
||||||
|
- Client command ack topic (QoS1, non-retained): `infoscreen/{uuid}/commands/ack` (compat alias: `infoscreen/{uuid}/command/ack`)
|
||||||
|
- Service-failed topic (retained, client→server): `infoscreen/{uuid}/service_failed`
|
||||||
- TV power intent Phase 1 topic (retained, QoS1): `infoscreen/groups/{group_id}/power/intent`
|
- TV power intent Phase 1 topic (retained, QoS1): `infoscreen/groups/{group_id}/power/intent`
|
||||||
|
|
||||||
TV power intent Phase 1 rules:
|
TV power intent Phase 1 rules:
|
||||||
@@ -82,7 +86,9 @@ TV power intent Phase 1 rules:
|
|||||||
- Scheduler: `POLL_INTERVAL_SECONDS`, `REFRESH_SECONDS`
|
- Scheduler: `POLL_INTERVAL_SECONDS`, `REFRESH_SECONDS`
|
||||||
- Power intent: `POWER_INTENT_PUBLISH_ENABLED`, `POWER_INTENT_HEARTBEAT_ENABLED`, `POWER_INTENT_EXPIRY_MULTIPLIER`, `POWER_INTENT_MIN_EXPIRY_SECONDS`
|
- Power intent: `POWER_INTENT_PUBLISH_ENABLED`, `POWER_INTENT_HEARTBEAT_ENABLED`, `POWER_INTENT_EXPIRY_MULTIPLIER`, `POWER_INTENT_MIN_EXPIRY_SECONDS`
|
||||||
- Monitoring: `PRIORITY_SCREENSHOT_TTL_SECONDS`
|
- Monitoring: `PRIORITY_SCREENSHOT_TTL_SECONDS`
|
||||||
|
- Crash recovery: `CRASH_RECOVERY_ENABLED`, `CRASH_RECOVERY_GRACE_SECONDS`, `CRASH_RECOVERY_LOCKOUT_MINUTES`, `CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS`
|
||||||
- Core: `DB_CONN`, `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_NAME`, `ENV`
|
- Core: `DB_CONN`, `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_NAME`, `ENV`
|
||||||
|
- MQTT auth/connectivity: `MQTT_BROKER_HOST`, `MQTT_BROKER_PORT`, `MQTT_USER`, `MQTT_PASSWORD` (listener/scheduler/server should use authenticated broker access)
|
||||||
|
|
||||||
## Edit guardrails
|
## Edit guardrails
|
||||||
- Do not edit generated assets in `dashboard/dist/`.
|
- Do not edit generated assets in `dashboard/dist/`.
|
||||||
|
|||||||
@@ -5,6 +5,31 @@ This changelog tracks all changes made in the development workspace, including i
|
|||||||
---
|
---
|
||||||
|
|
||||||
## Unreleased (development workspace)
|
## Unreleased (development workspace)
|
||||||
|
- Crash detection API: Added `GET /api/clients/crashed` returning clients with `process_status=crashed` or stale heartbeat; includes `crash_reason` field (`process_crashed` | `heartbeat_stale`).
|
||||||
|
- Crash auto-recovery (scheduler): Feature-flagged loop (`CRASH_RECOVERY_ENABLED`) scans crash candidates, issues `reboot_host` command, publishes to primary + compat MQTT topics; lockout window and expiry configurable via env.
|
||||||
|
- Command expiry sweep (scheduler): Unconditional per-cycle sweep in `sweep_expired_commands()` marks non-terminal `ClientCommand` rows past `expires_at` as `expired`.
|
||||||
|
- `restart_app` action registered in `server/routes/clients.py` API action map; sends same command lifecycle as `reboot_host`; safety lockout covers both actions.
|
||||||
|
- `service_failed` listener: subscribes to `infoscreen/+/service_failed` on every connect; persists `service_failed_at` + `service_failed_unit` to `Client`; empty payload (retain clear) silently ignored.
|
||||||
|
- Broker connection health: Listener health handler now extracts `broker_connection.reconnect_count` + `broker_connection.last_disconnect_at` and persists to `Client`.
|
||||||
|
- DB migration `b1c2d3e4f5a6`: adds `service_failed_at`, `service_failed_unit`, `mqtt_reconnect_count`, `mqtt_last_disconnect_at` to `clients` table.
|
||||||
|
- Model update: `models/models.py` Client class updated with all four new columns.
|
||||||
|
- `GET /api/clients/service_failed`: lists clients with `service_failed_at` set, admin-or-higher gated.
|
||||||
|
- `POST /api/clients/<uuid>/clear_service_failed`: clears DB flag and publishes empty retained MQTT to `infoscreen/{uuid}/service_failed`.
|
||||||
|
- Monitoring overview includes `mqtt_reconnect_count` + `mqtt_last_disconnect_at` per client.
|
||||||
|
- Frontend monitoring: orange service-failed alert panel (hidden when count=0), auto-refresh 15s, per-row Quittieren action.
|
||||||
|
- Frontend monitoring: client detail now shows MQTT reconnect count + last disconnect timestamp.
|
||||||
|
- Frontend types: `ServiceFailedClient`, `ServiceFailedClientsResponse`; helpers `fetchServiceFailedClients()`, `clearServiceFailed()` added to `dashboard/src/apiClients.ts`.
|
||||||
|
- `MQTT_EVENT_PAYLOAD_GUIDE.md`: added `service_failed` topic contract.
|
||||||
|
- MQTT auth hardening: Listener and scheduler now connect to broker with env-configured credentials (`MQTT_BROKER_HOST`, `MQTT_BROKER_PORT`, `MQTT_USER`, `MQTT_PASSWORD`) instead of anonymous fixed host/port defaults; optional TLS env toggles added in code path (`MQTT_TLS_*`).
|
||||||
|
- Broker auth enforcement: `mosquitto/config/mosquitto.conf` now disables anonymous access and enables password-file authentication. `docker-compose.yml` MQTT service now bootstraps/update password entries from env (`MQTT_USER`/`MQTT_PASSWORD`, optional canary user) before starting broker.
|
||||||
|
- Compose wiring: Added MQTT credential env propagation for listener/scheduler in both base and dev override compose files and switched MQTT healthcheck publish to authenticated mode.
|
||||||
|
- Backend implementation: Introduced client command lifecycle foundation for remote control in `server/routes/clients.py` with command persistence (`ClientCommand`), schema-based MQTT publish to `infoscreen/{uuid}/commands` (QoS1, non-retained), new endpoints `POST /api/clients/<uuid>/shutdown` and `GET /api/clients/commands/<command_id>`, and restart safety lockout (`blocked_safety` after 3 restarts in 15 minutes). Added migration `server/alembic/versions/aa12bb34cc56_add_client_commands_table.py` and model updates in `models/models.py`. Restart path keeps transitional legacy MQTT publish to `clients/{uuid}/restart` for compatibility.
|
||||||
|
- Listener integration: `listener/listener.py` now subscribes to `infoscreen/+/commands/ack` and updates command lifecycle states from client ACK payloads (`accepted`, `execution_started`, `completed`, `failed`).
|
||||||
|
- Frontend API client prep: Extended `dashboard/src/apiClients.ts` with `ClientCommand` typing and helper calls for lifecycle consumption (`shutdownClient`, `fetchClientCommandStatus`), and updated `restartClient` to accept optional reason payload.
|
||||||
|
- Contract freeze clarification: implementation-plan docs now explicitly freeze canonical MQTT topics (`infoscreen/{uuid}/commands`, `infoscreen/{uuid}/commands/ack`) and JSON schemas with examples; added transitional singular-topic compatibility aliases (`infoscreen/{uuid}/command`, `infoscreen/{uuid}/command/ack`) in server publish and listener ingest.
|
||||||
|
- Action value canonicalization: command payload actions are now frozen as host-level values (`reboot_host`, `shutdown_host`). API endpoint mapping is explicit (`/restart` -> `reboot_host`, `/shutdown` -> `shutdown_host`), and docs/examples were updated to remove `restart` payload ambiguity.
|
||||||
|
- Client helper snippets: Added frozen payload validation artifacts `implementation-plans/reboot-command-payload-schemas.md` and `implementation-plans/reboot-command-payload-schemas.json` (copy-ready snippets plus machine-validated JSON Schema).
|
||||||
|
- Documentation alignment: Added active reboot implementation handoff docs under `implementation-plans/` and linked them in `README.md` for immediate cross-team access (`reboot-implementation-handoff-share.md`, `reboot-implementation-handoff-client-team.md`, `reboot-kickoff-summary.md`).
|
||||||
- Programminfo GUI regression/fix: `dashboard/public/program-info.json` could not be loaded in Programminfo menu due to invalid JSON in the new alpha.16 changelog line (malformed quote in a text entry). Fixed JSON entry and verified file parses correctly again.
|
- Programminfo GUI regression/fix: `dashboard/public/program-info.json` could not be loaded in Programminfo menu due to invalid JSON in the new alpha.16 changelog line (malformed quote in a text entry). Fixed JSON entry and verified file parses correctly again.
|
||||||
- Dashboard holiday banner fix: `dashboard/src/dashboard.tsx` — `loadHolidayStatus` now uses a stable `useCallback` with empty deps, preventing repeated re-creation on render. `useEffect` depends only on the stable callback reference.
|
- Dashboard holiday banner fix: `dashboard/src/dashboard.tsx` — `loadHolidayStatus` now uses a stable `useCallback` with empty deps, preventing repeated re-creation on render. `useEffect` depends only on the stable callback reference.
|
||||||
- Dashboard Syncfusion stale-render fix: `MessageComponent` in the holiday banner now receives `key={`${severity}:${text}`}` to force remount when severity or text changes; without this Syncfusion cached stale DOM and the banner did not update reactively.
|
- Dashboard Syncfusion stale-render fix: `MessageComponent` in the holiday banner now receives `key={`${severity}:${text}`}` to force remount when severity or text changes; without this Syncfusion cached stale DOM and the banner did not update reactively.
|
||||||
|
|||||||
@@ -50,6 +50,91 @@ Contract notes:
|
|||||||
- Heartbeat republishes keep `intent_id` stable while refreshing `issued_at` and `expires_at`.
|
- Heartbeat republishes keep `intent_id` stable while refreshing `issued_at` and `expires_at`.
|
||||||
- Expiry is poll-based: `max(3 x poll_interval_sec, 90)`.
|
- Expiry is poll-based: `max(3 x poll_interval_sec, 90)`.
|
||||||
|
|
||||||
|
### Service-Failed Notification (client → server, retained)
|
||||||
|
- **Topic**: `infoscreen/{uuid}/service_failed`
|
||||||
|
- **QoS**: 1
|
||||||
|
- **Retained**: Yes
|
||||||
|
- **Direction**: client → server
|
||||||
|
- **Purpose**: Client signals that systemd has exhausted restart attempts (`StartLimitBurst` exceeded) — manual intervention is required.
|
||||||
|
|
||||||
|
Example payload:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"event": "service_failed",
|
||||||
|
"unit": "infoscreen-simclient.service",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"failed_at": "2026-04-05T08:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Contract notes:
|
||||||
|
- Message is retained so the server receives it even after a broker restart.
|
||||||
|
- Server persists `service_failed_at` and `service_failed_unit` to the `clients` table.
|
||||||
|
- To clear after resolution: `POST /api/clients/<uuid>/clear_service_failed` — clears the DB flag and publishes an empty retained payload to delete the retained message from the broker.
|
||||||
|
- Empty payload (empty bytes) on this topic = retain-clear in transit; listener ignores it.
|
||||||
|
|
||||||
|
### Client Command Intent (Phase 1)
|
||||||
|
- **Topic**: `infoscreen/{uuid}/commands`
|
||||||
|
- **QoS**: 1
|
||||||
|
- **Retained**: No
|
||||||
|
- **Format**: JSON object
|
||||||
|
- **Purpose**: Per-client control commands (currently `reboot_host` and `shutdown_host`)
|
||||||
|
|
||||||
|
Compatibility note:
|
||||||
|
- During restart transition, server also publishes legacy restart command to `clients/{uuid}/restart` with payload `{ "action": "restart" }`.
|
||||||
|
- During topic naming transition, server also publishes command payload to `infoscreen/{uuid}/command`.
|
||||||
|
|
||||||
|
Example payload:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"action": "reboot_host",
|
||||||
|
"issued_at": "2026-04-03T12:48:10Z",
|
||||||
|
"expires_at": "2026-04-03T12:52:10Z",
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "operator_request"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Contract notes:
|
||||||
|
- Clients must reject stale commands where local UTC time is greater than `expires_at`.
|
||||||
|
- Clients must deduplicate by `command_id` and never execute a duplicate command twice.
|
||||||
|
- `schema_version` is required for forward-compatibility.
|
||||||
|
- Allowed command action values in v1: `reboot_host`, `shutdown_host`, `restart_app`.
|
||||||
|
- `restart_app` = soft app restart (no OS reboot); `reboot_host` = full OS reboot.
|
||||||
|
- API mapping for operators: restart endpoint emits `reboot_host`; shutdown endpoint emits `shutdown_host`.
|
||||||
|
|
||||||
|
### Client Command Acknowledgements (Phase 1)
|
||||||
|
- **Topic**: `infoscreen/{uuid}/commands/ack`
|
||||||
|
- **QoS**: 1 (recommended)
|
||||||
|
- **Retained**: No
|
||||||
|
- **Format**: JSON object
|
||||||
|
- **Purpose**: Client reports command lifecycle progression back to server
|
||||||
|
|
||||||
|
Compatibility note:
|
||||||
|
- During topic naming transition, listener also accepts acknowledgements from `infoscreen/{uuid}/command/ack`.
|
||||||
|
|
||||||
|
Example payload:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"status": "execution_started",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Allowed `status` values:
|
||||||
|
- `accepted`
|
||||||
|
- `execution_started`
|
||||||
|
- `completed`
|
||||||
|
- `failed`
|
||||||
|
|
||||||
## Message Structure
|
## Message Structure
|
||||||
|
|
||||||
### General Principles
|
### General Principles
|
||||||
|
|||||||
@@ -192,9 +192,18 @@ Rollout strategy (Phase 1):
|
|||||||
- [AI-INSTRUCTIONS-MAINTENANCE.md](AI-INSTRUCTIONS-MAINTENANCE.md)
|
- [AI-INSTRUCTIONS-MAINTENANCE.md](AI-INSTRUCTIONS-MAINTENANCE.md)
|
||||||
- [DEV-CHANGELOG.md](DEV-CHANGELOG.md)
|
- [DEV-CHANGELOG.md](DEV-CHANGELOG.md)
|
||||||
|
|
||||||
|
### Active Implementation Plans
|
||||||
|
- [implementation-plans/reboot-implementation-handoff-share.md](implementation-plans/reboot-implementation-handoff-share.md)
|
||||||
|
- [implementation-plans/reboot-implementation-handoff-client-team.md](implementation-plans/reboot-implementation-handoff-client-team.md)
|
||||||
|
- [implementation-plans/reboot-kickoff-summary.md](implementation-plans/reboot-kickoff-summary.md)
|
||||||
|
|
||||||
## API Highlights
|
## API Highlights
|
||||||
|
|
||||||
- Core resources: clients, groups, events, academic periods
|
- Core resources: clients, groups, events, academic periods
|
||||||
|
- Client command lifecycle:
|
||||||
|
- `POST /api/clients/<uuid>/restart`
|
||||||
|
- `POST /api/clients/<uuid>/shutdown`
|
||||||
|
- `GET /api/clients/commands/<command_id>`
|
||||||
- Holidays: `GET/POST /api/holidays`, `POST /api/holidays/upload`, `PUT/DELETE /api/holidays/<id>`
|
- Holidays: `GET/POST /api/holidays`, `POST /api/holidays/upload`, `PUT/DELETE /api/holidays/<id>`
|
||||||
- Media: upload/download/stream + conversion status
|
- Media: upload/download/stream + conversion status
|
||||||
- Auth: login/logout/change-password
|
- Auth: login/logout/change-password
|
||||||
|
|||||||
149
RESTART_VALIDATION_CHECKLIST.md
Normal file
149
RESTART_VALIDATION_CHECKLIST.md
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
# Restart Validation Checklist
|
||||||
|
|
||||||
|
Purpose: Validate end-to-end restart command flow after MQTT auth hardening.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
- API command issue route: `POST /api/clients/{uuid}/restart`
|
||||||
|
- MQTT command topic: `infoscreen/{uuid}/commands` (compat: `infoscreen/{uuid}/command`)
|
||||||
|
- MQTT ACK topic: `infoscreen/{uuid}/commands/ack` (compat: `infoscreen/{uuid}/command/ack`)
|
||||||
|
- Status API: `GET /api/clients/commands/{command_id}`
|
||||||
|
|
||||||
|
## Preconditions
|
||||||
|
|
||||||
|
- Stack is up and healthy (`db`, `mqtt`, `server`, `listener`, `scheduler`).
|
||||||
|
- You have an `admin` or `superadmin` account.
|
||||||
|
- At least one canary client is online and can process restart commands.
|
||||||
|
- `.env` has valid `MQTT_USER` / `MQTT_PASSWORD`.
|
||||||
|
|
||||||
|
## 1) Open Monitoring Session (MQTT)
|
||||||
|
|
||||||
|
On host/server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
set -a
|
||||||
|
. ./.env
|
||||||
|
set +a
|
||||||
|
|
||||||
|
mosquitto_sub -h 127.0.0.1 -p 1883 \
|
||||||
|
-u "$MQTT_USER" -P "$MQTT_PASSWORD" \
|
||||||
|
-t "infoscreen/+/commands" \
|
||||||
|
-t "infoscreen/+/commands/ack" \
|
||||||
|
-t "infoscreen/+/command" \
|
||||||
|
-t "infoscreen/+/command/ack" \
|
||||||
|
-v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected:
|
||||||
|
- Command publish appears on `infoscreen/{uuid}/commands`.
|
||||||
|
- ACK(s) appear on `infoscreen/{uuid}/commands/ack`.
|
||||||
|
|
||||||
|
## 2) Login and Keep Session Cookie
|
||||||
|
|
||||||
|
```bash
|
||||||
|
API_BASE="http://127.0.0.1:8000"
|
||||||
|
USER="<admin_or_superadmin_username>"
|
||||||
|
PASS="<password>"
|
||||||
|
|
||||||
|
curl -sS -X POST "$API_BASE/api/auth/login" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d "{\"username\":\"$USER\",\"password\":\"$PASS\"}" \
|
||||||
|
-c /tmp/infoscreen-cookies.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected:
|
||||||
|
- Login success response.
|
||||||
|
- Cookie jar file created at `/tmp/infoscreen-cookies.txt`.
|
||||||
|
|
||||||
|
## 3) Pick Target Client UUID
|
||||||
|
|
||||||
|
Option A: Use known canary UUID.
|
||||||
|
|
||||||
|
Option B: Query alive clients:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -sS "$API_BASE/api/clients/with_alive_status" -b /tmp/infoscreen-cookies.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Choose one `uuid` where `is_alive` is `true`.
|
||||||
|
|
||||||
|
## 4) Issue Restart Command
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CLIENT_UUID="<target_uuid>"
|
||||||
|
|
||||||
|
curl -sS -X POST "$API_BASE/api/clients/$CLIENT_UUID/restart" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-b /tmp/infoscreen-cookies.txt \
|
||||||
|
-d '{"reason":"canary_restart_validation"}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected:
|
||||||
|
- HTTP `202` on success.
|
||||||
|
- JSON includes `command.commandId` and initial status around `published`.
|
||||||
|
- In MQTT monitor, a command payload with:
|
||||||
|
- `schema_version: "1.0"`
|
||||||
|
- `action: "reboot_host"`
|
||||||
|
- matching `command_id`.
|
||||||
|
|
||||||
|
## 5) Poll Command Lifecycle Until Terminal
|
||||||
|
|
||||||
|
```bash
|
||||||
|
COMMAND_ID="<command_id_from_previous_step>"
|
||||||
|
|
||||||
|
for i in $(seq 1 20); do
|
||||||
|
curl -sS "$API_BASE/api/clients/commands/$COMMAND_ID" -b /tmp/infoscreen-cookies.txt
|
||||||
|
echo
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected status progression (typical):
|
||||||
|
- `queued` -> `publish_in_progress` -> `published` -> `ack_received` -> `execution_started` -> `completed`
|
||||||
|
|
||||||
|
Failure/alternate terminal states:
|
||||||
|
- `failed` (check `errorCode` / `errorMessage`)
|
||||||
|
- `blocked_safety` (reboot lockout triggered)
|
||||||
|
|
||||||
|
## 6) Validate Offline/Timeout Behavior
|
||||||
|
|
||||||
|
- Repeat step 4 for an offline client (or stop client process first).
|
||||||
|
- Confirm command does not falsely end as `completed`.
|
||||||
|
- Confirm status remains non-success and has usable failure diagnostics.
|
||||||
|
|
||||||
|
## 7) Validate Safety Lockout
|
||||||
|
|
||||||
|
Current lockout in API route:
|
||||||
|
- Threshold: 3 reboot commands
|
||||||
|
- Window: 15 minutes
|
||||||
|
|
||||||
|
Test:
|
||||||
|
- Send 4 restart commands quickly for same `uuid`.
|
||||||
|
|
||||||
|
Expected:
|
||||||
|
- The fourth request returns HTTP `429` (threshold is 3 per 15-minute window).
|
||||||
|
- Command entry state `blocked_safety` with lockout error details.
|
||||||
|
|
||||||
|
## 8) Service Log Spot Check
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose logs --tail=150 server listener mqtt
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected:
|
||||||
|
- No MQTT auth errors (`Not authorized`, `Connection Refused: not authorised`).
|
||||||
|
- Listener logs show ACK processing for `command_id`.
|
||||||
|
|
||||||
|
## 9) Acceptance Criteria
|
||||||
|
|
||||||
|
- Restart command publish is visible on MQTT.
|
||||||
|
- ACK is received and mapped by listener.
|
||||||
|
- Status endpoint reaches correct terminal state.
|
||||||
|
- Safety lockout works under repeated restart attempts.
|
||||||
|
- No auth regression in broker/service logs.
|
||||||
|
|
||||||
|
## Cleanup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
rm -f /tmp/infoscreen-cookies.txt
|
||||||
|
```
|
||||||
@@ -5,6 +5,42 @@
|
|||||||
|
|
||||||
This changelog documents technical and developer-relevant changes included in public releases. For development workspace changes, see DEV-CHANGELOG.md. Not all changes here are reflected in the user-facing changelog (`program-info.json`), and not all UI/feature changes are repeated here. Some changes (e.g., backend refactoring, API adjustments, infrastructure, developer tooling, or internal logic) may only appear in TECH-CHANGELOG.md. For UI/feature changes, see `dashboard/public/program-info.json`.
|
This changelog documents technical and developer-relevant changes included in public releases. For development workspace changes, see DEV-CHANGELOG.md. Not all changes here are reflected in the user-facing changelog (`program-info.json`), and not all UI/feature changes are repeated here. Some changes (e.g., backend refactoring, API adjustments, infrastructure, developer tooling, or internal logic) may only appear in TECH-CHANGELOG.md. For UI/feature changes, see `dashboard/public/program-info.json`.
|
||||||
|
|
||||||
|
## Unreleased
|
||||||
|
- **Crash detection, auto-recovery, and service_failed monitoring (2026-04-05)**:
|
||||||
|
- Added `GET /api/clients/crashed` endpoint: returns active clients with `process_status=crashed` or stale heartbeat beyond grace period, with `crash_reason` field.
|
||||||
|
- Added `restart_app` command action alongside existing `reboot_host`/`shutdown_host`; registered in `server/routes/clients.py` with same safety lockout.
|
||||||
|
- Scheduler: Added crash auto-recovery loop (feature-flagged via `CRASH_RECOVERY_ENABLED`): scans candidates via `get_crash_recovery_candidates()`, issues `reboot_host` command per client, publishes to primary + compat MQTT topics, updates command lifecycle.
|
||||||
|
- Scheduler: Added unconditional command expiry sweep each poll cycle via `sweep_expired_commands()` in `scheduler/db_utils.py`: marks non-terminal `ClientCommand` rows with `expires_at < now` as `expired`.
|
||||||
|
- Added `service_failed` topic ingestion in `listener/listener.py`: subscribe to `infoscreen/+/service_failed` on every connect; persist `service_failed_at` and `service_failed_unit` on Client; empty payload (retain clear) ignored.
|
||||||
|
- Added `broker_connection` block extraction in health payload handler: persists `mqtt_reconnect_count` and `mqtt_last_disconnect_at` from `infoscreen/{uuid}/health`.
|
||||||
|
- Added four new DB columns to `clients` table via migration `b1c2d3e4f5a6`: `service_failed_at`, `service_failed_unit`, `mqtt_reconnect_count`, `mqtt_last_disconnect_at`.
|
||||||
|
- Added `GET /api/clients/service_failed` endpoint: lists clients with `service_failed_at` set, ordered by event time desc.
|
||||||
|
- Added `POST /api/clients/<uuid>/clear_service_failed` endpoint: clears DB flag and publishes empty retained MQTT message to clear `infoscreen/{uuid}/service_failed`.
|
||||||
|
- Monitoring overview API (`GET /api/client-logs/monitoring-overview`) now includes `mqtt_reconnect_count` and `mqtt_last_disconnect_at` per client.
|
||||||
|
- Frontend: Added orange service-failed alert panel to monitoring page (hidden when empty, auto-refresh 15s, per-row Quittieren button with loading/success/error states).
|
||||||
|
- Frontend: Client detail panel in monitoring now shows MQTT reconnect count and last disconnect timestamp.
|
||||||
|
- Frontend: Added `ServiceFailedClient`, `ServiceFailedClientsResponse` types; `fetchServiceFailedClients()` and `clearServiceFailed()` API helpers in `dashboard/src/apiClients.ts`.
|
||||||
|
- Added `service_failed` topic contract to `MQTT_EVENT_PAYLOAD_GUIDE.md`.
|
||||||
|
- 🔐 **MQTT auth hardening for server-side services (2026-04-03)**:
|
||||||
|
- `listener/listener.py` now uses env-based broker connectivity for host/port and credentials (`MQTT_BROKER_HOST`, `MQTT_BROKER_PORT`, `MQTT_USER`, `MQTT_PASSWORD`) instead of anonymous fixed `mqtt:1883`.
|
||||||
|
- `scheduler/scheduler.py` now uses the same env-based MQTT auth path and optional TLS toggles (`MQTT_TLS_ENABLED`, `MQTT_TLS_CA_CERT`, `MQTT_TLS_CERTFILE`, `MQTT_TLS_KEYFILE`, `MQTT_TLS_INSECURE`).
|
||||||
|
- `docker-compose.yml` and `docker-compose.override.yml` now pass MQTT credentials into listener and scheduler containers for consistent authenticated connections.
|
||||||
|
- Mosquitto is now configured for authenticated access (`allow_anonymous false`, `password_file /mosquitto/config/passwd`) and bootstraps credentials from env at container startup.
|
||||||
|
- MQTT healthcheck publish now authenticates with configured broker credentials.
|
||||||
|
- 🔁 **Client command lifecycle foundation (restart/shutdown) (2026-04-03)**:
|
||||||
|
- Added persistent command tracking model `ClientCommand` in `models/models.py` and Alembic migration `aa12bb34cc56_add_client_commands_table.py`.
|
||||||
|
- Upgraded `POST /api/clients/<uuid>/restart` from fire-and-forget publish to lifecycle-aware command issuance with command metadata (`command_id`, `issued_at`, `expires_at`, `reason`, `requested_by`).
|
||||||
|
- Added `POST /api/clients/<uuid>/shutdown` endpoint with the same lifecycle contract.
|
||||||
|
- Added `GET /api/clients/commands/<command_id>` status endpoint for command-state polling.
|
||||||
|
- Added restart safety lockout in API path: max 3 restart commands per client in rolling 15 minutes, returning `blocked_safety` when threshold is exceeded.
|
||||||
|
- Added command MQTT publish to `infoscreen/{uuid}/commands` (QoS1, non-retained) and temporary legacy restart compatibility publish to `clients/{uuid}/restart`.
|
||||||
|
- Added temporary topic compatibility publish to `infoscreen/{uuid}/command` and listener acceptance of `infoscreen/{uuid}/command/ack` to bridge singular/plural naming assumptions.
|
||||||
|
- Canonicalized command payload action values to host-level semantics: `reboot_host` and `shutdown_host` (API routes remain `/restart` and `/shutdown` for operator UX compatibility).
|
||||||
|
- Added frozen payload validation snippets for integration/client tooling in `implementation-plans/reboot-command-payload-schemas.md` and `implementation-plans/reboot-command-payload-schemas.json`.
|
||||||
|
- Listener now subscribes to `infoscreen/{uuid}/commands/ack` and maps client acknowledgements into command lifecycle states (`ack_received`, `execution_started`, `completed`, `failed`).
|
||||||
|
- Initial lifecycle statuses implemented server-side: `queued`, `publish_in_progress`, `published`, `failed`, and `blocked_safety`.
|
||||||
|
- Frontend API helper extended in `dashboard/src/apiClients.ts` with `ClientCommand` typing plus command APIs for shutdown and status polling preparation.
|
||||||
|
|
||||||
## 2026.1.0-alpha.16 (2026-04-02)
|
## 2026.1.0-alpha.16 (2026-04-02)
|
||||||
- 🐛 **Dashboard holiday banner refactoring and state fix (`dashboard/src/dashboard.tsx`)**:
|
- 🐛 **Dashboard holiday banner refactoring and state fix (`dashboard/src/dashboard.tsx`)**:
|
||||||
- **Motivation — unstable fetch function:** `loadHolidayStatus` had `location.pathname` in its `useCallback` dependency array, causing a new function reference to be created on every navigation event. The `useEffect` depending on that reference then re-fired, producing overlapping API calls at mount that cancelled each other via the request-sequence guard, leaving the banner unresolved.
|
- **Motivation — unstable fetch function:** `loadHolidayStatus` had `location.pathname` in its `useCallback` dependency array, causing a new function reference to be created on every navigation event. The `useEffect` depending on that reference then re-fired, producing overlapping API calls at mount that cancelled each other via the request-sequence guard, leaving the banner unresolved.
|
||||||
|
|||||||
55
TODO.md
Normal file
55
TODO.md
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
# TODO
|
||||||
|
|
||||||
|
## MQTT TLS Hardening (Production)
|
||||||
|
|
||||||
|
- [ ] Enable TLS listener in `mosquitto/config/mosquitto.conf` (e.g., port 8883) while keeping 1883 only for temporary migration if needed.
|
||||||
|
- [ ] Generate and deploy server certificate + private key for Mosquitto (CA-signed or internal PKI).
|
||||||
|
- [ ] Add CA certificate distribution strategy for all clients and services (server, listener, scheduler, external monitors).
|
||||||
|
- [ ] Set strict file permissions for cert/key material (`chmod 600` for keys, least-privilege ownership).
|
||||||
|
- [ ] Update Docker Compose MQTT service to mount TLS cert/key/CA paths read-only.
|
||||||
|
- [ ] Add environment variables for TLS in `.env` / `.env.example`:
|
||||||
|
- `MQTT_TLS_ENABLED=true`
|
||||||
|
- `MQTT_TLS_CA_CERT=<path>`
|
||||||
|
- `MQTT_TLS_CERTFILE=<path>` (if mutual TLS used)
|
||||||
|
- `MQTT_TLS_KEYFILE=<path>` (if mutual TLS used)
|
||||||
|
- `MQTT_TLS_INSECURE=false`
|
||||||
|
- [ ] Switch internal services to TLS connection settings and verify authenticated reconnect behavior.
|
||||||
|
- [ ] Decide policy: TLS-only auth (username/password over TLS) vs mutual TLS + username/password.
|
||||||
|
- [ ] Disable non-TLS listener (1883) after all clients migrated.
|
||||||
|
- [ ] Restrict MQTT firewall ingress to trusted source ranges only.
|
||||||
|
- [ ] Add Mosquitto ACL file for topic-level permissions per role/client type.
|
||||||
|
- [ ] Add cert rotation process (renewal schedule, rollout, rollback steps).
|
||||||
|
- [ ] Add monitoring/alerting for certificate expiry and broker auth failures.
|
||||||
|
- [ ] Add runbook section for external monitoring clients (how to connect with CA validation).
|
||||||
|
- [ ] Perform a staged rollout (canary group first), then full migration.
|
||||||
|
- [ ] Document final TLS contract in `MQTT_EVENT_PAYLOAD_GUIDE.md` and deployment docs.
|
||||||
|
|
||||||
|
## Client Recovery Paths
|
||||||
|
|
||||||
|
### Path 1 — Software running → restart via MQTT ✅
|
||||||
|
- Server-side fully implemented (`restart_app` action, command lifecycle, monitoring panel).
|
||||||
|
- [ ] Client team: handle `restart_app` action in command handler (soft app restart, no reboot).
|
||||||
|
|
||||||
|
### Path 2 — Software crashed → MQTT unavailable
|
||||||
|
- Robust solution is **systemd `Restart=always`** (or `Restart=on-failure`) on the client device — no server involvement, OS init system restarts the process automatically.
|
||||||
|
- Server detects the crash via missing heartbeat (`process_status=crashed`), records it, and shows it in the monitoring panel. Recovery is confirmed when heartbeats resume.
|
||||||
|
- [ ] Client team: ensure the infoscreen service unit has `Restart=always` and `RestartSec=<delay>` configured in its systemd unit file.
|
||||||
|
- [ ] Evaluate whether MQTT `clean_session=False` + fixed `client_id` is worth adding for cases where the app crashes but the MQTT connection briefly survives (would allow QoS1 command delivery on reconnect).
|
||||||
|
- Note: the existing scheduler crash recovery (`reboot_host` via MQTT) is unreliable for a fully crashed app unless the client uses a persistent MQTT session. Revisit if client team enables `clean_session=False`.
|
||||||
|
|
||||||
|
### Path 3 — OS crashed / hung → power cycle needed (customer-dependent)
|
||||||
|
- No software-based recovery path is possible when the OS is unresponsive.
|
||||||
|
- Recovery requires external hardware intervention; options depend on customer infrastructure:
|
||||||
|
- Smart plug / PDU with API (e.g., Shelly, Tasmota, APC, Raritan)
|
||||||
|
- IPMI / iDRAC / BMC (server-class hardware)
|
||||||
|
- CEC power command from another device on the same HDMI chain
|
||||||
|
- Wake-on-LAN after a scheduled power-cut (limited applicability)
|
||||||
|
- [ ] Clarify with customer which hardware is available / acceptable.
|
||||||
|
- [ ] If a smart plug or PDU API is chosen: design a server-side "hard power cycle" command type and integration (out of scope until hardware is confirmed).
|
||||||
|
- [ ] Document chosen solution and integrate into monitoring runbook once decided.
|
||||||
|
|
||||||
|
## Optional Security Follow-ups
|
||||||
|
|
||||||
|
- [ ] Move MQTT credentials to Docker secrets or a vault-backed secret source.
|
||||||
|
- [ ] Rotate `MQTT_USER`/`MQTT_PASSWORD` on a fixed schedule.
|
||||||
|
- [ ] Add fail2ban/rate-limiting protections for exposed broker ports.
|
||||||
@@ -2,9 +2,9 @@
|
|||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/png" href="/favicon.png" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>Vite + React + TS</title>
|
<title>Infoscreen</title>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="root"></div>
|
<div id="root"></div>
|
||||||
|
|||||||
BIN
dashboard/public/favicon.png
Normal file
BIN
dashboard/public/favicon.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 225 KiB |
@@ -25,10 +25,6 @@
|
|||||||
{ "name": "Alembic", "license": "MIT" }
|
{ "name": "Alembic", "license": "MIT" }
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"buildInfo": {
|
|
||||||
"buildDate": "2025-12-29T12:00:00Z",
|
|
||||||
"commitId": "9f2ae8b44c3a"
|
|
||||||
},
|
|
||||||
"changelog": [
|
"changelog": [
|
||||||
{
|
{
|
||||||
"version": "2026.1.0-alpha.16",
|
"version": "2026.1.0-alpha.16",
|
||||||
|
|||||||
@@ -39,6 +39,8 @@ export interface MonitoringClient {
|
|||||||
};
|
};
|
||||||
latestLog?: MonitoringLogEntry | null;
|
latestLog?: MonitoringLogEntry | null;
|
||||||
latestError?: MonitoringLogEntry | null;
|
latestError?: MonitoringLogEntry | null;
|
||||||
|
mqttReconnectCount?: number | null;
|
||||||
|
mqttLastDisconnectAt?: string | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface MonitoringOverview {
|
export interface MonitoringOverview {
|
||||||
|
|||||||
@@ -24,6 +24,62 @@ export interface Group {
|
|||||||
is_active?: boolean;
|
is_active?: boolean;
|
||||||
clients: Client[];
|
clients: Client[];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface CrashedClient {
|
||||||
|
uuid: string;
|
||||||
|
description?: string | null;
|
||||||
|
hostname?: string | null;
|
||||||
|
ip?: string | null;
|
||||||
|
group_id?: number | null;
|
||||||
|
is_alive: boolean;
|
||||||
|
process_status?: string | null;
|
||||||
|
screen_health_status?: string | null;
|
||||||
|
last_alive?: string | null;
|
||||||
|
crash_reason: 'process_crashed' | 'heartbeat_stale';
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface CrashedClientsResponse {
|
||||||
|
crashed_count: number;
|
||||||
|
grace_period_seconds: number;
|
||||||
|
clients: CrashedClient[];
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ServiceFailedClient {
|
||||||
|
uuid: string;
|
||||||
|
description?: string | null;
|
||||||
|
hostname?: string | null;
|
||||||
|
ip?: string | null;
|
||||||
|
group_id?: number | null;
|
||||||
|
is_alive: boolean;
|
||||||
|
last_alive?: string | null;
|
||||||
|
service_failed_at: string;
|
||||||
|
service_failed_unit?: string | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ServiceFailedClientsResponse {
|
||||||
|
service_failed_count: number;
|
||||||
|
clients: ServiceFailedClient[];
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface ClientCommand {
|
||||||
|
commandId: string;
|
||||||
|
clientUuid: string;
|
||||||
|
action: 'reboot_host' | 'shutdown_host' | 'restart_app';
|
||||||
|
status: string;
|
||||||
|
reason?: string | null;
|
||||||
|
requestedBy?: number | null;
|
||||||
|
issuedAt?: string | null;
|
||||||
|
expiresAt?: string | null;
|
||||||
|
publishedAt?: string | null;
|
||||||
|
ackedAt?: string | null;
|
||||||
|
executionStartedAt?: string | null;
|
||||||
|
completedAt?: string | null;
|
||||||
|
failedAt?: string | null;
|
||||||
|
errorCode?: string | null;
|
||||||
|
errorMessage?: string | null;
|
||||||
|
createdAt?: string | null;
|
||||||
|
updatedAt?: string | null;
|
||||||
|
}
|
||||||
// Liefert alle Gruppen mit zugehörigen Clients
|
// Liefert alle Gruppen mit zugehörigen Clients
|
||||||
export async function fetchGroupsWithClients(): Promise<Group[]> {
|
export async function fetchGroupsWithClients(): Promise<Group[]> {
|
||||||
const response = await fetch('/api/groups/with_clients');
|
const response = await fetch('/api/groups/with_clients');
|
||||||
@@ -79,9 +135,11 @@ export async function updateClient(uuid: string, data: { description?: string; m
|
|||||||
return await res.json();
|
return await res.json();
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function restartClient(uuid: string): Promise<{ success: boolean; message?: string }> {
|
export async function restartClient(uuid: string, reason?: string): Promise<{ success: boolean; message?: string; command?: ClientCommand }> {
|
||||||
const response = await fetch(`/api/clients/${uuid}/restart`, {
|
const response = await fetch(`/api/clients/${uuid}/restart`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ reason: reason || null }),
|
||||||
});
|
});
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
const error = await response.json();
|
const error = await response.json();
|
||||||
@@ -90,6 +148,58 @@ export async function restartClient(uuid: string): Promise<{ success: boolean; m
|
|||||||
return await response.json();
|
return await response.json();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export async function shutdownClient(uuid: string, reason?: string): Promise<{ success: boolean; message?: string; command?: ClientCommand }> {
|
||||||
|
const response = await fetch(`/api/clients/${uuid}/shutdown`, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ reason: reason || null }),
|
||||||
|
});
|
||||||
|
if (!response.ok) {
|
||||||
|
const error = await response.json();
|
||||||
|
throw new Error(error.error || 'Fehler beim Herunterfahren des Clients');
|
||||||
|
}
|
||||||
|
return await response.json();
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function fetchClientCommandStatus(commandId: string): Promise<ClientCommand> {
|
||||||
|
const response = await fetch(`/api/clients/commands/${commandId}`);
|
||||||
|
if (!response.ok) {
|
||||||
|
const error = await response.json();
|
||||||
|
throw new Error(error.error || 'Fehler beim Laden des Command-Status');
|
||||||
|
}
|
||||||
|
return await response.json();
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function fetchCrashedClients(): Promise<CrashedClientsResponse> {
|
||||||
|
const response = await fetch('/api/clients/crashed', { credentials: 'include' });
|
||||||
|
if (!response.ok) {
|
||||||
|
const err = await response.json().catch(() => ({}));
|
||||||
|
throw new Error(err.error || 'Fehler beim Laden der abgestürzten Clients');
|
||||||
|
}
|
||||||
|
return await response.json();
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function fetchServiceFailedClients(): Promise<ServiceFailedClientsResponse> {
|
||||||
|
const response = await fetch('/api/clients/service_failed', { credentials: 'include' });
|
||||||
|
if (!response.ok) {
|
||||||
|
const err = await response.json().catch(() => ({}));
|
||||||
|
throw new Error(err.error || 'Fehler beim Laden der service_failed Clients');
|
||||||
|
}
|
||||||
|
return await response.json();
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function clearServiceFailed(uuid: string): Promise<{ success: boolean; message?: string }> {
|
||||||
|
const response = await fetch(`/api/clients/${uuid}/clear_service_failed`, {
|
||||||
|
method: 'POST',
|
||||||
|
credentials: 'include',
|
||||||
|
});
|
||||||
|
if (!response.ok) {
|
||||||
|
const err = await response.json().catch(() => ({}));
|
||||||
|
throw new Error(err.error || 'Fehler beim Quittieren des service_failed Flags');
|
||||||
|
}
|
||||||
|
return await response.json();
|
||||||
|
}
|
||||||
|
|
||||||
export async function deleteClient(uuid: string) {
|
export async function deleteClient(uuid: string) {
|
||||||
const res = await fetch(`/api/clients/${uuid}`, {
|
const res = await fetch(`/api/clients/${uuid}`, {
|
||||||
method: 'DELETE',
|
method: 'DELETE',
|
||||||
|
|||||||
@@ -370,4 +370,49 @@
|
|||||||
.monitoring-log-dialog-actions {
|
.monitoring-log-dialog-actions {
|
||||||
padding: 0 0.2rem 0.4rem;
|
padding: 0 0.2rem 0.4rem;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Crash recovery panel */
|
||||||
|
.monitoring-crash-panel {
|
||||||
|
border-left: 4px solid #dc2626;
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.monitoring-service-failed-panel {
|
||||||
|
border-left: 4px solid #ea580c;
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.monitoring-crash-table {
|
||||||
|
width: 100%;
|
||||||
|
border-collapse: collapse;
|
||||||
|
font-size: 0.875rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.monitoring-crash-table th {
|
||||||
|
text-align: left;
|
||||||
|
padding: 0.5rem 0.75rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #64748b;
|
||||||
|
border-bottom: 1px solid #e2e8f0;
|
||||||
|
background: #f8fafc;
|
||||||
|
}
|
||||||
|
|
||||||
|
.monitoring-crash-table td {
|
||||||
|
padding: 0.55rem 0.75rem;
|
||||||
|
border-bottom: 1px solid #f1f5f9;
|
||||||
|
vertical-align: middle;
|
||||||
|
}
|
||||||
|
|
||||||
|
.monitoring-crash-table tr:last-child td {
|
||||||
|
border-bottom: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.monitoring-crash-table tr:hover td {
|
||||||
|
background: #fef2f2;
|
||||||
|
}
|
||||||
|
|
||||||
|
.monitoring-meta-hint {
|
||||||
|
color: #94a3b8;
|
||||||
|
font-size: 0.8rem;
|
||||||
}
|
}
|
||||||
@@ -7,6 +7,16 @@ import {
|
|||||||
type MonitoringLogEntry,
|
type MonitoringLogEntry,
|
||||||
type MonitoringOverview,
|
type MonitoringOverview,
|
||||||
} from './apiClientMonitoring';
|
} from './apiClientMonitoring';
|
||||||
|
import {
|
||||||
|
fetchCrashedClients,
|
||||||
|
fetchServiceFailedClients,
|
||||||
|
clearServiceFailed,
|
||||||
|
restartClient,
|
||||||
|
type CrashedClient,
|
||||||
|
type CrashedClientsResponse,
|
||||||
|
type ServiceFailedClient,
|
||||||
|
type ServiceFailedClientsResponse,
|
||||||
|
} from './apiClients';
|
||||||
import { useAuth } from './useAuth';
|
import { useAuth } from './useAuth';
|
||||||
import { ButtonComponent } from '@syncfusion/ej2-react-buttons';
|
import { ButtonComponent } from '@syncfusion/ej2-react-buttons';
|
||||||
import { DropDownListComponent } from '@syncfusion/ej2-react-dropdowns';
|
import { DropDownListComponent } from '@syncfusion/ej2-react-dropdowns';
|
||||||
@@ -156,6 +166,12 @@ const MonitoringDashboard: React.FC = () => {
|
|||||||
const [screenshotErrored, setScreenshotErrored] = React.useState<boolean>(false);
|
const [screenshotErrored, setScreenshotErrored] = React.useState<boolean>(false);
|
||||||
const selectedClientUuidRef = React.useRef<string | null>(null);
|
const selectedClientUuidRef = React.useRef<string | null>(null);
|
||||||
const [selectedLogEntry, setSelectedLogEntry] = React.useState<MonitoringLogEntry | null>(null);
|
const [selectedLogEntry, setSelectedLogEntry] = React.useState<MonitoringLogEntry | null>(null);
|
||||||
|
const [crashedClients, setCrashedClients] = React.useState<CrashedClientsResponse | null>(null);
|
||||||
|
const [restartStates, setRestartStates] = React.useState<Record<string, 'idle' | 'loading' | 'success' | 'failed'>>({});
|
||||||
|
const [restartErrors, setRestartErrors] = React.useState<Record<string, string>>({});
|
||||||
|
const [serviceFailedClients, setServiceFailedClients] = React.useState<ServiceFailedClientsResponse | null>(null);
|
||||||
|
const [clearStates, setClearStates] = React.useState<Record<string, 'idle' | 'loading' | 'success' | 'failed'>>({});
|
||||||
|
const [clearErrors, setClearErrors] = React.useState<Record<string, string>>({});
|
||||||
|
|
||||||
const selectedClient = React.useMemo<MonitoringClient | null>(() => {
|
const selectedClient = React.useMemo<MonitoringClient | null>(() => {
|
||||||
if (!overview || !selectedClientUuid) return null;
|
if (!overview || !selectedClientUuid) return null;
|
||||||
@@ -197,9 +213,37 @@ const MonitoringDashboard: React.FC = () => {
|
|||||||
}
|
}
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
|
const loadCrashedClients = React.useCallback(async () => {
|
||||||
|
try {
|
||||||
|
const data = await fetchCrashedClients();
|
||||||
|
setCrashedClients(data);
|
||||||
|
} catch {
|
||||||
|
// non-fatal: crashes panel just stays stale
|
||||||
|
}
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
const loadServiceFailedClients = React.useCallback(async () => {
|
||||||
|
try {
|
||||||
|
const data = await fetchServiceFailedClients();
|
||||||
|
setServiceFailedClients(data);
|
||||||
|
} catch {
|
||||||
|
// non-fatal
|
||||||
|
}
|
||||||
|
}, []);
|
||||||
|
|
||||||
React.useEffect(() => {
|
React.useEffect(() => {
|
||||||
loadOverview(hours, false);
|
loadOverview(hours, false);
|
||||||
}, [hours, loadOverview]);
|
loadCrashedClients();
|
||||||
|
loadServiceFailedClients();
|
||||||
|
}, [hours, loadOverview, loadCrashedClients, loadServiceFailedClients]);
|
||||||
|
|
||||||
|
React.useEffect(() => {
|
||||||
|
const id = window.setInterval(() => {
|
||||||
|
loadCrashedClients();
|
||||||
|
loadServiceFailedClients();
|
||||||
|
}, REFRESH_INTERVAL_MS);
|
||||||
|
return () => window.clearInterval(id);
|
||||||
|
}, [loadCrashedClients, loadServiceFailedClients]);
|
||||||
|
|
||||||
React.useEffect(() => {
|
React.useEffect(() => {
|
||||||
const hasActivePriorityScreenshots = (overview?.summary.activePriorityScreenshots || 0) > 0;
|
const hasActivePriorityScreenshots = (overview?.summary.activePriorityScreenshots || 0) > 0;
|
||||||
@@ -308,6 +352,194 @@ const MonitoringDashboard: React.FC = () => {
|
|||||||
{renderMetricCard('Fehler-Logs', overview?.summary.errorLogs || 0, 'Im gewählten Zeitraum', '#b91c1c')}
|
{renderMetricCard('Fehler-Logs', overview?.summary.errorLogs || 0, 'Im gewählten Zeitraum', '#b91c1c')}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{crashedClients && crashedClients.crashed_count > 0 && (
|
||||||
|
<div className="monitoring-panel monitoring-crash-panel">
|
||||||
|
<div className="monitoring-panel-header">
|
||||||
|
<h3 style={{ color: '#dc2626' }}>
|
||||||
|
Abgestürzte / Nicht erreichbare Clients
|
||||||
|
</h3>
|
||||||
|
<span
|
||||||
|
style={{
|
||||||
|
background: '#fee2e2',
|
||||||
|
color: '#991b1b',
|
||||||
|
padding: '2px 10px',
|
||||||
|
borderRadius: '12px',
|
||||||
|
fontWeight: 600,
|
||||||
|
fontSize: '0.85rem',
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{crashedClients.crashed_count}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<table className="monitoring-crash-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Client</th>
|
||||||
|
<th>Gruppe</th>
|
||||||
|
<th>Ursache</th>
|
||||||
|
<th>Prozessstatus</th>
|
||||||
|
<th>Letztes Signal</th>
|
||||||
|
<th>Aktion</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{crashedClients.clients.map((c: CrashedClient) => {
|
||||||
|
const state = restartStates[c.uuid] || 'idle';
|
||||||
|
const errMsg = restartErrors[c.uuid];
|
||||||
|
const displayName = c.description || c.hostname || c.uuid;
|
||||||
|
return (
|
||||||
|
<tr key={c.uuid}>
|
||||||
|
<td>
|
||||||
|
<span title={c.uuid}>{displayName}</span>
|
||||||
|
{c.ip && <span className="monitoring-meta-hint"> ({c.ip})</span>}
|
||||||
|
</td>
|
||||||
|
<td>{c.group_id ?? '—'}</td>
|
||||||
|
<td>
|
||||||
|
<span
|
||||||
|
className="monitoring-status-badge"
|
||||||
|
style={
|
||||||
|
c.crash_reason === 'process_crashed'
|
||||||
|
? { color: '#991b1b', backgroundColor: '#fee2e2' }
|
||||||
|
: { color: '#78350f', backgroundColor: '#fef3c7' }
|
||||||
|
}
|
||||||
|
>
|
||||||
|
{c.crash_reason === 'process_crashed' ? 'Prozess abgestürzt' : 'Heartbeat veraltet'}
|
||||||
|
</span>
|
||||||
|
</td>
|
||||||
|
<td>{c.process_status || '—'}</td>
|
||||||
|
<td>{formatRelative(c.last_alive)}</td>
|
||||||
|
<td>
|
||||||
|
{state === 'loading' && <span style={{ color: '#6b7280', fontSize: '0.85rem' }}>Wird gesendet…</span>}
|
||||||
|
{state === 'success' && <span style={{ color: '#15803d', fontSize: '0.85rem' }}>✓ Neustart gesendet</span>}
|
||||||
|
{state === 'failed' && (
|
||||||
|
<span style={{ color: '#dc2626', fontSize: '0.85rem' }} title={errMsg}>
|
||||||
|
✗ Fehler
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
{(state === 'idle' || state === 'failed') && (
|
||||||
|
<ButtonComponent
|
||||||
|
cssClass="e-small e-danger"
|
||||||
|
disabled={state === 'loading'}
|
||||||
|
onClick={async () => {
|
||||||
|
setRestartStates(prev => ({ ...prev, [c.uuid]: 'loading' }));
|
||||||
|
setRestartErrors(prev => { const n = { ...prev }; delete n[c.uuid]; return n; });
|
||||||
|
try {
|
||||||
|
await restartClient(c.uuid, c.crash_reason);
|
||||||
|
setRestartStates(prev => ({ ...prev, [c.uuid]: 'success' }));
|
||||||
|
setTimeout(() => {
|
||||||
|
setRestartStates(prev => ({ ...prev, [c.uuid]: 'idle' }));
|
||||||
|
loadCrashedClients();
|
||||||
|
}, 8000);
|
||||||
|
} catch (e) {
|
||||||
|
const msg = e instanceof Error ? e.message : 'Unbekannter Fehler';
|
||||||
|
setRestartStates(prev => ({ ...prev, [c.uuid]: 'failed' }));
|
||||||
|
setRestartErrors(prev => ({ ...prev, [c.uuid]: msg }));
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
Neustart
|
||||||
|
</ButtonComponent>
|
||||||
|
)}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{serviceFailedClients && serviceFailedClients.service_failed_count > 0 && (
|
||||||
|
<div className="monitoring-panel monitoring-service-failed-panel">
|
||||||
|
<div className="monitoring-panel-header">
|
||||||
|
<h3 style={{ color: '#7c2d12' }}>
|
||||||
|
Service dauerhaft ausgefallen (systemd hat aufgegeben)
|
||||||
|
</h3>
|
||||||
|
<span
|
||||||
|
style={{
|
||||||
|
background: '#ffedd5',
|
||||||
|
color: '#7c2d12',
|
||||||
|
padding: '2px 10px',
|
||||||
|
borderRadius: '12px',
|
||||||
|
fontWeight: 600,
|
||||||
|
fontSize: '0.85rem',
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{serviceFailedClients.service_failed_count}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<p className="monitoring-meta-hint" style={{ marginBottom: '0.75rem' }}>
|
||||||
|
Diese Clients konnten von systemd nicht mehr automatisch neugestartet werden.
|
||||||
|
Manuelle Intervention erforderlich. Nach Behebung bitte quittieren.
|
||||||
|
</p>
|
||||||
|
<table className="monitoring-crash-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Client</th>
|
||||||
|
<th>Gruppe</th>
|
||||||
|
<th>Unit</th>
|
||||||
|
<th>Ausgefallen am</th>
|
||||||
|
<th>Letztes Signal</th>
|
||||||
|
<th>Aktion</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{serviceFailedClients.clients.map((c: ServiceFailedClient) => {
|
||||||
|
const state = clearStates[c.uuid] || 'idle';
|
||||||
|
const errMsg = clearErrors[c.uuid];
|
||||||
|
const displayName = c.description || c.hostname || c.uuid;
|
||||||
|
const failedAt = c.service_failed_at
|
||||||
|
? new Date(c.service_failed_at.endsWith('Z') ? c.service_failed_at : c.service_failed_at + 'Z').toLocaleString('de-DE')
|
||||||
|
: '—';
|
||||||
|
return (
|
||||||
|
<tr key={c.uuid}>
|
||||||
|
<td>
|
||||||
|
<span title={c.uuid}>{displayName}</span>
|
||||||
|
{c.ip && <span className="monitoring-meta-hint"> ({c.ip})</span>}
|
||||||
|
</td>
|
||||||
|
<td>{c.group_id ?? '—'}</td>
|
||||||
|
<td><code style={{ fontSize: '0.8rem' }}>{c.service_failed_unit || '—'}</code></td>
|
||||||
|
<td>{failedAt}</td>
|
||||||
|
<td>{formatRelative(c.last_alive)}</td>
|
||||||
|
<td>
|
||||||
|
{state === 'loading' && <span style={{ color: '#6b7280', fontSize: '0.85rem' }}>Wird quittiert…</span>}
|
||||||
|
{state === 'success' && <span style={{ color: '#15803d', fontSize: '0.85rem' }}>✓ Quittiert</span>}
|
||||||
|
{state === 'failed' && (
|
||||||
|
<span style={{ color: '#dc2626', fontSize: '0.85rem' }} title={errMsg}>✗ Fehler</span>
|
||||||
|
)}
|
||||||
|
{(state === 'idle' || state === 'failed') && (
|
||||||
|
<ButtonComponent
|
||||||
|
cssClass="e-small e-warning"
|
||||||
|
disabled={state === 'loading'}
|
||||||
|
onClick={async () => {
|
||||||
|
setClearStates(prev => ({ ...prev, [c.uuid]: 'loading' }));
|
||||||
|
setClearErrors(prev => { const n = { ...prev }; delete n[c.uuid]; return n; });
|
||||||
|
try {
|
||||||
|
await clearServiceFailed(c.uuid);
|
||||||
|
setClearStates(prev => ({ ...prev, [c.uuid]: 'success' }));
|
||||||
|
setTimeout(() => {
|
||||||
|
setClearStates(prev => ({ ...prev, [c.uuid]: 'idle' }));
|
||||||
|
loadServiceFailedClients();
|
||||||
|
}, 4000);
|
||||||
|
} catch (e) {
|
||||||
|
const msg = e instanceof Error ? e.message : 'Unbekannter Fehler';
|
||||||
|
setClearStates(prev => ({ ...prev, [c.uuid]: 'failed' }));
|
||||||
|
setClearErrors(prev => ({ ...prev, [c.uuid]: msg }));
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
Quittieren
|
||||||
|
</ButtonComponent>
|
||||||
|
)}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{loading && !overview ? (
|
{loading && !overview ? (
|
||||||
<MessageComponent severity="Info" content="Monitoring-Daten werden geladen ..." />
|
<MessageComponent severity="Info" content="Monitoring-Daten werden geladen ..." />
|
||||||
) : (
|
) : (
|
||||||
@@ -393,6 +625,16 @@ const MonitoringDashboard: React.FC = () => {
|
|||||||
<span>Bildschirmstatus</span>
|
<span>Bildschirmstatus</span>
|
||||||
<strong>{selectedClient.screenHealthStatus || 'UNKNOWN'}</strong>
|
<strong>{selectedClient.screenHealthStatus || 'UNKNOWN'}</strong>
|
||||||
</div>
|
</div>
|
||||||
|
<div className="monitoring-detail-row">
|
||||||
|
<span>MQTT Reconnects</span>
|
||||||
|
<strong>{selectedClient.mqttReconnectCount != null ? selectedClient.mqttReconnectCount : '—'}</strong>
|
||||||
|
</div>
|
||||||
|
{selectedClient.mqttLastDisconnectAt && (
|
||||||
|
<div className="monitoring-detail-row">
|
||||||
|
<span>Letzter Disconnect</span>
|
||||||
|
<strong>{formatTimestamp(selectedClient.mqttLastDisconnectAt)}</strong>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
<div className="monitoring-detail-row">
|
<div className="monitoring-detail-row">
|
||||||
<span>Letzte Analyse</span>
|
<span>Letzte Analyse</span>
|
||||||
<strong>{formatTimestamp(selectedClient.lastScreenshotAnalyzed)}</strong>
|
<strong>{formatTimestamp(selectedClient.lastScreenshotAnalyzed)}</strong>
|
||||||
|
|||||||
@@ -12,10 +12,6 @@ interface ProgramInfo {
|
|||||||
frontend: { name: string; license: string }[];
|
frontend: { name: string; license: string }[];
|
||||||
backend: { name: string; license: string }[];
|
backend: { name: string; license: string }[];
|
||||||
};
|
};
|
||||||
buildInfo: {
|
|
||||||
buildDate: string;
|
|
||||||
commitId: string;
|
|
||||||
};
|
|
||||||
changelog: {
|
changelog: {
|
||||||
version: string;
|
version: string;
|
||||||
date: string;
|
date: string;
|
||||||
@@ -85,30 +81,30 @@ const Programminfo: React.FC = () => {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div className="e-card-content">
|
<div className="e-card-content">
|
||||||
<div style={{ display: 'flex', flexDirection: 'column', gap: '0.5rem' }}>
|
<div style={{ display: 'flex', flexDirection: 'column', gap: '0.25rem' }}>
|
||||||
<p>
|
<div><strong>Version:</strong> {info.version}</div>
|
||||||
<strong>Version:</strong> {info.version}
|
<div><strong>Copyright:</strong> {info.copyright}</div>
|
||||||
</p>
|
<div>
|
||||||
<p>
|
|
||||||
<strong>Copyright:</strong> {info.copyright}
|
|
||||||
</p>
|
|
||||||
<p>
|
|
||||||
<strong>Support:</strong>{' '}
|
<strong>Support:</strong>{' '}
|
||||||
<a href={`mailto:${info.supportContact}`} style={{ color: '#2563eb', textDecoration: 'none' }}>
|
<a href={`mailto:${info.supportContact}`} style={{ color: '#2563eb', textDecoration: 'none' }}>
|
||||||
{info.supportContact}
|
{info.supportContact}
|
||||||
</a>
|
</a>
|
||||||
</p>
|
</div>
|
||||||
<hr style={{ margin: '1rem 0' }} />
|
<hr style={{ margin: '0.5rem 0' }} />
|
||||||
<h4 style={{ fontWeight: 600 }}>Build-Informationen</h4>
|
<div style={{ fontWeight: 600, fontSize: '0.875rem', marginBottom: '0.125rem' }}>Build-Informationen</div>
|
||||||
<p>
|
<div><strong>Build-Datum:</strong> {new Date(__BUILD_DATE__).toLocaleString('de-DE')}</div>
|
||||||
<strong>Build-Datum:</strong> {new Date(info.buildInfo.buildDate).toLocaleString('de-DE')}
|
<div>
|
||||||
</p>
|
<strong>Umgebung:</strong>{' '}
|
||||||
<p>
|
|
||||||
<strong>Commit-ID:</strong>{' '}
|
|
||||||
<span style={{ fontFamily: monoFont, fontSize: '0.875rem', background: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem' }}>
|
<span style={{ fontFamily: monoFont, fontSize: '0.875rem', background: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem' }}>
|
||||||
{info.buildInfo.commitId}
|
{__BUILD_ENV__}
|
||||||
</span>
|
</span>
|
||||||
</p>
|
</div>
|
||||||
|
<div>
|
||||||
|
<strong>Node.js:</strong>{' '}
|
||||||
|
<span style={{ fontFamily: monoFont, fontSize: '0.875rem', background: '#f3f4f6', padding: '0.125rem 0.25rem', borderRadius: '0.25rem' }}>
|
||||||
|
{__NODE_VERSION__}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
4
dashboard/src/vite-env.d.ts
vendored
4
dashboard/src/vite-env.d.ts
vendored
@@ -1 +1,5 @@
|
|||||||
/// <reference types="vite/client" />
|
/// <reference types="vite/client" />
|
||||||
|
|
||||||
|
declare const __BUILD_DATE__: string;
|
||||||
|
declare const __NODE_VERSION__: string;
|
||||||
|
declare const __BUILD_ENV__: string;
|
||||||
|
|||||||
@@ -6,6 +6,11 @@ import react from '@vitejs/plugin-react';
|
|||||||
export default defineConfig({
|
export default defineConfig({
|
||||||
cacheDir: './.vite',
|
cacheDir: './.vite',
|
||||||
plugins: [react()],
|
plugins: [react()],
|
||||||
|
define: {
|
||||||
|
__BUILD_DATE__: JSON.stringify(new Date().toISOString()),
|
||||||
|
__NODE_VERSION__: JSON.stringify(process.version),
|
||||||
|
__BUILD_ENV__: JSON.stringify(process.env.NODE_ENV ?? 'development'),
|
||||||
|
},
|
||||||
resolve: {
|
resolve: {
|
||||||
// 🔧 KORRIGIERT: Entferne die problematischen Aliases komplett
|
// 🔧 KORRIGIERT: Entferne die problematischen Aliases komplett
|
||||||
// Diese verursachen das "not an absolute path" Problem
|
// Diese verursachen das "not an absolute path" Problem
|
||||||
|
|||||||
@@ -45,15 +45,37 @@ services:
|
|||||||
image: eclipse-mosquitto:2.0.21
|
image: eclipse-mosquitto:2.0.21
|
||||||
container_name: infoscreen-mqtt
|
container_name: infoscreen-mqtt
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
command: >
|
||||||
|
sh -c 'set -eu;
|
||||||
|
: "$${MQTT_USER:?MQTT_USER not set}";
|
||||||
|
: "$${MQTT_PASSWORD:?MQTT_PASSWORD not set}";
|
||||||
|
touch /mosquitto/config/passwd;
|
||||||
|
chmod 600 /mosquitto/config/passwd;
|
||||||
|
mosquitto_passwd -b /mosquitto/config/passwd "$${MQTT_USER}" "$${MQTT_PASSWORD}";
|
||||||
|
if [ -n "$${MQTT_CANARY_USER:-}" ] && [ -n "$${MQTT_CANARY_PASSWORD:-}" ]; then
|
||||||
|
mosquitto_passwd -b /mosquitto/config/passwd "$${MQTT_CANARY_USER}" "$${MQTT_CANARY_PASSWORD}";
|
||||||
|
fi;
|
||||||
|
exec mosquitto -c /mosquitto/config/mosquitto.conf'
|
||||||
volumes:
|
volumes:
|
||||||
- ./mosquitto/config/mosquitto.conf:/mosquitto/config/mosquitto.conf:ro
|
- ./mosquitto/config:/mosquitto/config
|
||||||
|
- ./mosquitto/data:/mosquitto/data
|
||||||
|
- ./mosquitto/log:/mosquitto/log
|
||||||
ports:
|
ports:
|
||||||
- "1883:1883"
|
- "1883:1883"
|
||||||
- "9001:9001"
|
- "9001:9001"
|
||||||
|
environment:
|
||||||
|
- MQTT_USER=${MQTT_USER}
|
||||||
|
- MQTT_PASSWORD=${MQTT_PASSWORD}
|
||||||
|
- MQTT_CANARY_USER=${MQTT_CANARY_USER:-}
|
||||||
|
- MQTT_CANARY_PASSWORD=${MQTT_CANARY_PASSWORD:-}
|
||||||
networks:
|
networks:
|
||||||
- infoscreen-net
|
- infoscreen-net
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "mosquitto_pub -h localhost -t test -m 'health' || exit 1"]
|
test:
|
||||||
|
[
|
||||||
|
"CMD-SHELL",
|
||||||
|
"mosquitto_pub -h localhost -u $$MQTT_USER -P $$MQTT_PASSWORD -t test -m 'health' || exit 1",
|
||||||
|
]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 3
|
retries: 3
|
||||||
@@ -125,6 +147,11 @@ services:
|
|||||||
DB_PASSWORD: ${DB_PASSWORD}
|
DB_PASSWORD: ${DB_PASSWORD}
|
||||||
DB_NAME: ${DB_NAME}
|
DB_NAME: ${DB_NAME}
|
||||||
DB_ROOT_PASSWORD: ${DB_ROOT_PASSWORD}
|
DB_ROOT_PASSWORD: ${DB_ROOT_PASSWORD}
|
||||||
|
API_BASE_URL: http://server:8000
|
||||||
|
MQTT_BROKER_HOST: ${MQTT_BROKER_HOST:-mqtt}
|
||||||
|
MQTT_BROKER_PORT: ${MQTT_BROKER_PORT:-1883}
|
||||||
|
MQTT_USER: ${MQTT_USER}
|
||||||
|
MQTT_PASSWORD: ${MQTT_PASSWORD}
|
||||||
networks:
|
networks:
|
||||||
- infoscreen-net
|
- infoscreen-net
|
||||||
|
|
||||||
@@ -141,7 +168,18 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
# HINZUGEFÜGT: Datenbank-Verbindungsstring
|
# HINZUGEFÜGT: Datenbank-Verbindungsstring
|
||||||
DB_CONN: "mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}"
|
DB_CONN: "mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}"
|
||||||
MQTT_PORT: 1883
|
MQTT_BROKER_HOST: ${MQTT_BROKER_HOST:-mqtt}
|
||||||
|
MQTT_BROKER_PORT: ${MQTT_BROKER_PORT:-1883}
|
||||||
|
MQTT_USER: ${MQTT_USER}
|
||||||
|
MQTT_PASSWORD: ${MQTT_PASSWORD}
|
||||||
|
POLL_INTERVAL_SECONDS: ${POLL_INTERVAL_SECONDS:-30}
|
||||||
|
POWER_INTENT_PUBLISH_ENABLED: ${POWER_INTENT_PUBLISH_ENABLED:-false}
|
||||||
|
POWER_INTENT_HEARTBEAT_ENABLED: ${POWER_INTENT_HEARTBEAT_ENABLED:-true}
|
||||||
|
POWER_INTENT_EXPIRY_MULTIPLIER: ${POWER_INTENT_EXPIRY_MULTIPLIER:-3}
|
||||||
|
POWER_INTENT_MIN_EXPIRY_SECONDS: ${POWER_INTENT_MIN_EXPIRY_SECONDS:-90}
|
||||||
|
CRASH_RECOVERY_ENABLED: ${CRASH_RECOVERY_ENABLED:-false}
|
||||||
|
CRASH_RECOVERY_GRACE_SECONDS: ${CRASH_RECOVERY_GRACE_SECONDS:-180}
|
||||||
|
CRASH_RECOVERY_LOCKOUT_MINUTES: ${CRASH_RECOVERY_LOCKOUT_MINUTES:-15}
|
||||||
networks:
|
networks:
|
||||||
- infoscreen-net
|
- infoscreen-net
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,10 @@ services:
|
|||||||
- DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}
|
- DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}
|
||||||
- DB_URL=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}
|
- DB_URL=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}
|
||||||
- API_BASE_URL=http://server:8000
|
- API_BASE_URL=http://server:8000
|
||||||
|
- MQTT_BROKER_HOST=${MQTT_BROKER_HOST:-mqtt}
|
||||||
|
- MQTT_BROKER_PORT=${MQTT_BROKER_PORT:-1883}
|
||||||
|
- MQTT_USER=${MQTT_USER}
|
||||||
|
- MQTT_PASSWORD=${MQTT_PASSWORD}
|
||||||
- ENV=${ENV:-development}
|
- ENV=${ENV:-development}
|
||||||
- FLASK_SECRET_KEY=${FLASK_SECRET_KEY:-dev-secret-key-change-in-production}
|
- FLASK_SECRET_KEY=${FLASK_SECRET_KEY:-dev-secret-key-change-in-production}
|
||||||
- DEFAULT_SUPERADMIN_USERNAME=${DEFAULT_SUPERADMIN_USERNAME:-superadmin}
|
- DEFAULT_SUPERADMIN_USERNAME=${DEFAULT_SUPERADMIN_USERNAME:-superadmin}
|
||||||
@@ -70,6 +74,17 @@ services:
|
|||||||
image: eclipse-mosquitto:2.0.21 # ✅ GUT: Version ist bereits spezifisch
|
image: eclipse-mosquitto:2.0.21 # ✅ GUT: Version ist bereits spezifisch
|
||||||
container_name: infoscreen-mqtt
|
container_name: infoscreen-mqtt
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
command: >
|
||||||
|
sh -c 'set -eu;
|
||||||
|
: "$${MQTT_USER:?MQTT_USER not set}";
|
||||||
|
: "$${MQTT_PASSWORD:?MQTT_PASSWORD not set}";
|
||||||
|
touch /mosquitto/config/passwd;
|
||||||
|
chmod 600 /mosquitto/config/passwd;
|
||||||
|
mosquitto_passwd -b /mosquitto/config/passwd "$${MQTT_USER}" "$${MQTT_PASSWORD}";
|
||||||
|
if [ -n "$${MQTT_CANARY_USER:-}" ] && [ -n "$${MQTT_CANARY_PASSWORD:-}" ]; then
|
||||||
|
mosquitto_passwd -b /mosquitto/config/passwd "$${MQTT_CANARY_USER}" "$${MQTT_CANARY_PASSWORD}";
|
||||||
|
fi;
|
||||||
|
exec mosquitto -c /mosquitto/config/mosquitto.conf'
|
||||||
volumes:
|
volumes:
|
||||||
- ./mosquitto/config:/mosquitto/config
|
- ./mosquitto/config:/mosquitto/config
|
||||||
- ./mosquitto/data:/mosquitto/data
|
- ./mosquitto/data:/mosquitto/data
|
||||||
@@ -77,13 +92,18 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "1883:1883" # Standard MQTT
|
- "1883:1883" # Standard MQTT
|
||||||
- "9001:9001" # WebSocket (falls benötigt)
|
- "9001:9001" # WebSocket (falls benötigt)
|
||||||
|
environment:
|
||||||
|
- MQTT_USER=${MQTT_USER}
|
||||||
|
- MQTT_PASSWORD=${MQTT_PASSWORD}
|
||||||
|
- MQTT_CANARY_USER=${MQTT_CANARY_USER:-}
|
||||||
|
- MQTT_CANARY_PASSWORD=${MQTT_CANARY_PASSWORD:-}
|
||||||
networks:
|
networks:
|
||||||
- infoscreen-net
|
- infoscreen-net
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test:
|
test:
|
||||||
[
|
[
|
||||||
"CMD-SHELL",
|
"CMD-SHELL",
|
||||||
"mosquitto_pub -h localhost -t test -m 'health' || exit 1",
|
"mosquitto_pub -h localhost -u $$MQTT_USER -P $$MQTT_PASSWORD -t test -m 'health' || exit 1",
|
||||||
]
|
]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
@@ -169,13 +189,18 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
# HINZUGEFÜGT: Datenbank-Verbindungsstring
|
# HINZUGEFÜGT: Datenbank-Verbindungsstring
|
||||||
- DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}
|
- DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}
|
||||||
- MQTT_BROKER_URL=mqtt
|
- MQTT_BROKER_HOST=${MQTT_BROKER_HOST:-mqtt}
|
||||||
- MQTT_PORT=1883
|
- MQTT_BROKER_PORT=${MQTT_BROKER_PORT:-1883}
|
||||||
|
- MQTT_USER=${MQTT_USER}
|
||||||
|
- MQTT_PASSWORD=${MQTT_PASSWORD}
|
||||||
- POLL_INTERVAL_SECONDS=${POLL_INTERVAL_SECONDS:-30}
|
- POLL_INTERVAL_SECONDS=${POLL_INTERVAL_SECONDS:-30}
|
||||||
- POWER_INTENT_PUBLISH_ENABLED=${POWER_INTENT_PUBLISH_ENABLED:-false}
|
- POWER_INTENT_PUBLISH_ENABLED=${POWER_INTENT_PUBLISH_ENABLED:-false}
|
||||||
- POWER_INTENT_HEARTBEAT_ENABLED=${POWER_INTENT_HEARTBEAT_ENABLED:-true}
|
- POWER_INTENT_HEARTBEAT_ENABLED=${POWER_INTENT_HEARTBEAT_ENABLED:-true}
|
||||||
- POWER_INTENT_EXPIRY_MULTIPLIER=${POWER_INTENT_EXPIRY_MULTIPLIER:-3}
|
- POWER_INTENT_EXPIRY_MULTIPLIER=${POWER_INTENT_EXPIRY_MULTIPLIER:-3}
|
||||||
- POWER_INTENT_MIN_EXPIRY_SECONDS=${POWER_INTENT_MIN_EXPIRY_SECONDS:-90}
|
- POWER_INTENT_MIN_EXPIRY_SECONDS=${POWER_INTENT_MIN_EXPIRY_SECONDS:-90}
|
||||||
|
- CRASH_RECOVERY_ENABLED=${CRASH_RECOVERY_ENABLED:-false}
|
||||||
|
- CRASH_RECOVERY_GRACE_SECONDS=${CRASH_RECOVERY_GRACE_SECONDS:-180}
|
||||||
|
- CRASH_RECOVERY_LOCKOUT_MINUTES=${CRASH_RECOVERY_LOCKOUT_MINUTES:-15}
|
||||||
networks:
|
networks:
|
||||||
- infoscreen-net
|
- infoscreen-net
|
||||||
|
|
||||||
|
|||||||
149
implementation-plans/reboot-command-payload-schemas.json
Normal file
149
implementation-plans/reboot-command-payload-schemas.json
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
{
|
||||||
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||||
|
"$id": "https://infoscreen.local/schemas/reboot-command-payload-schemas.json",
|
||||||
|
"title": "Infoscreen Reboot Command Payload Schemas",
|
||||||
|
"description": "Frozen v1 schemas for per-client command and command acknowledgement payloads.",
|
||||||
|
"$defs": {
|
||||||
|
"commandPayloadV1": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"schema_version",
|
||||||
|
"command_id",
|
||||||
|
"client_uuid",
|
||||||
|
"action",
|
||||||
|
"issued_at",
|
||||||
|
"expires_at",
|
||||||
|
"requested_by",
|
||||||
|
"reason"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"schema_version": {
|
||||||
|
"type": "string",
|
||||||
|
"const": "1.0"
|
||||||
|
},
|
||||||
|
"command_id": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "uuid"
|
||||||
|
},
|
||||||
|
"client_uuid": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "uuid"
|
||||||
|
},
|
||||||
|
"action": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"reboot_host",
|
||||||
|
"shutdown_host"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"issued_at": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time"
|
||||||
|
},
|
||||||
|
"expires_at": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "date-time"
|
||||||
|
},
|
||||||
|
"requested_by": {
|
||||||
|
"type": [
|
||||||
|
"integer",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"minimum": 1
|
||||||
|
},
|
||||||
|
"reason": {
|
||||||
|
"type": [
|
||||||
|
"string",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"maxLength": 2000
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"commandAckPayloadV1": {
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": [
|
||||||
|
"command_id",
|
||||||
|
"status",
|
||||||
|
"error_code",
|
||||||
|
"error_message"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"command_id": {
|
||||||
|
"type": "string",
|
||||||
|
"format": "uuid"
|
||||||
|
},
|
||||||
|
"status": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"accepted",
|
||||||
|
"execution_started",
|
||||||
|
"completed",
|
||||||
|
"failed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"error_code": {
|
||||||
|
"type": [
|
||||||
|
"string",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"maxLength": 128
|
||||||
|
},
|
||||||
|
"error_message": {
|
||||||
|
"type": [
|
||||||
|
"string",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"maxLength": 4000
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"allOf": [
|
||||||
|
{
|
||||||
|
"if": {
|
||||||
|
"properties": {
|
||||||
|
"status": {
|
||||||
|
"const": "failed"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"then": {
|
||||||
|
"properties": {
|
||||||
|
"error_code": {
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"error_message": {
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"examples": [
|
||||||
|
{
|
||||||
|
"commandPayloadV1": {
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"action": "reboot_host",
|
||||||
|
"issued_at": "2026-04-03T12:48:10Z",
|
||||||
|
"expires_at": "2026-04-03T12:52:10Z",
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "operator_request"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"commandAckPayloadV1": {
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"status": "execution_started",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
59
implementation-plans/reboot-command-payload-schemas.md
Normal file
59
implementation-plans/reboot-command-payload-schemas.md
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
## Reboot Command Payload Schema Snippets
|
||||||
|
|
||||||
|
This file provides copy-ready validation snippets for client and integration test helpers.
|
||||||
|
|
||||||
|
### Canonical Topics (v1)
|
||||||
|
1. Command topic: infoscreen/{client_uuid}/commands
|
||||||
|
2. Ack topic: infoscreen/{client_uuid}/commands/ack
|
||||||
|
|
||||||
|
### Transitional Compatibility Topics
|
||||||
|
1. Command topic alias: infoscreen/{client_uuid}/command
|
||||||
|
2. Ack topic alias: infoscreen/{client_uuid}/command/ack
|
||||||
|
|
||||||
|
### Canonical Action Values
|
||||||
|
1. reboot_host
|
||||||
|
2. shutdown_host
|
||||||
|
|
||||||
|
### Ack Status Values
|
||||||
|
1. accepted
|
||||||
|
2. execution_started
|
||||||
|
3. completed
|
||||||
|
4. failed
|
||||||
|
|
||||||
|
### JSON Schema Source
|
||||||
|
Use this file for machine validation:
|
||||||
|
1. implementation-plans/reboot-command-payload-schemas.json
|
||||||
|
|
||||||
|
### Minimal Command Schema Snippet
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": ["schema_version", "command_id", "client_uuid", "action", "issued_at", "expires_at", "requested_by", "reason"],
|
||||||
|
"properties": {
|
||||||
|
"schema_version": { "const": "1.0" },
|
||||||
|
"command_id": { "type": "string", "format": "uuid" },
|
||||||
|
"client_uuid": { "type": "string", "format": "uuid" },
|
||||||
|
"action": { "enum": ["reboot_host", "shutdown_host"] },
|
||||||
|
"issued_at": { "type": "string", "format": "date-time" },
|
||||||
|
"expires_at": { "type": "string", "format": "date-time" },
|
||||||
|
"requested_by": { "type": ["integer", "null"] },
|
||||||
|
"reason": { "type": ["string", "null"] }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Minimal Ack Schema Snippet
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required": ["command_id", "status", "error_code", "error_message"],
|
||||||
|
"properties": {
|
||||||
|
"command_id": { "type": "string", "format": "uuid" },
|
||||||
|
"status": { "enum": ["accepted", "execution_started", "completed", "failed"] },
|
||||||
|
"error_code": { "type": ["string", "null"] },
|
||||||
|
"error_message": { "type": ["string", "null"] }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
@@ -0,0 +1,146 @@
|
|||||||
|
## Client Team Implementation Spec (Raspberry Pi 5)
|
||||||
|
|
||||||
|
### Mission
|
||||||
|
Implement client-side command handling for reliable restart and shutdown with strict validation, idempotency, acknowledgements, and reboot recovery continuity.
|
||||||
|
|
||||||
|
### Ownership Boundaries
|
||||||
|
1. Client team owns command intake, execution, acknowledgement emission, and post-reboot continuity.
|
||||||
|
2. Platform team owns command issuance, lifecycle aggregation, and server-side escalation logic.
|
||||||
|
3. Client implementation must not assume managed PoE availability.
|
||||||
|
|
||||||
|
### Required Client Behaviors
|
||||||
|
|
||||||
|
### Frozen MQTT Topics and Schemas (v1)
|
||||||
|
1. Canonical command topic: infoscreen/{client_uuid}/commands.
|
||||||
|
2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack.
|
||||||
|
3. Transitional compatibility topics during migration:
|
||||||
|
- infoscreen/{client_uuid}/command
|
||||||
|
- infoscreen/{client_uuid}/command/ack
|
||||||
|
4. QoS policy: command QoS 1, ack QoS 1 recommended.
|
||||||
|
5. Retain policy: commands and acks are non-retained.
|
||||||
|
|
||||||
|
Frozen command payload schema:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"action": "reboot_host",
|
||||||
|
"issued_at": "2026-04-03T12:48:10Z",
|
||||||
|
"expires_at": "2026-04-03T12:52:10Z",
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "operator_request"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Frozen ack payload schema:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"status": "execution_started",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Allowed ack status values:
|
||||||
|
1. accepted
|
||||||
|
2. execution_started
|
||||||
|
3. completed
|
||||||
|
4. failed
|
||||||
|
|
||||||
|
Frozen command action values for v1:
|
||||||
|
1. reboot_host
|
||||||
|
2. shutdown_host
|
||||||
|
|
||||||
|
Reserved but not emitted by server in v1:
|
||||||
|
1. restart_service
|
||||||
|
|
||||||
|
Validation snippets for helper scripts:
|
||||||
|
1. Human-readable snippets: implementation-plans/reboot-command-payload-schemas.md
|
||||||
|
2. Machine-validated JSON Schema: implementation-plans/reboot-command-payload-schemas.json
|
||||||
|
|
||||||
|
### 1. Command Intake
|
||||||
|
1. Subscribe to the canonical command topic with QoS 1.
|
||||||
|
2. Parse required fields: schema_version, command_id, action, issued_at, expires_at, reason, requested_by, target metadata.
|
||||||
|
3. Reject invalid payloads with failed acknowledgement including error_code and diagnostic message.
|
||||||
|
4. Reject stale commands when current time exceeds expires_at.
|
||||||
|
5. Ignore already-processed command_id values.
|
||||||
|
|
||||||
|
### 2. Idempotency And Persistence
|
||||||
|
1. Persist processed command_id and execution result on local storage.
|
||||||
|
2. Persistence must survive service restart and full OS reboot.
|
||||||
|
3. On restart, reload dedupe cache before processing newly delivered commands.
|
||||||
|
|
||||||
|
### 3. Acknowledgement Contract Behavior
|
||||||
|
1. Emit accepted immediately after successful validation and dedupe pass.
|
||||||
|
2. Emit execution_started immediately before invoking the command action.
|
||||||
|
3. Emit completed only when local success condition is confirmed.
|
||||||
|
4. Emit failed with structured error_code on validation or execution failure.
|
||||||
|
5. If MQTT is temporarily unavailable, retry ack publish with bounded backoff until command expiry.
|
||||||
|
|
||||||
|
### 4. Execution Security Model
|
||||||
|
1. Execute via systemd-managed privileged helper.
|
||||||
|
2. Allow only whitelisted operations:
|
||||||
|
- reboot_host
|
||||||
|
- shutdown_host
|
||||||
|
3. Optionally keep restart_service handler as reserved path, but do not require it for v1 conformance.
|
||||||
|
4. Disallow arbitrary shell commands and untrusted arguments.
|
||||||
|
5. Enforce per-command execution timeout and terminate hung child processes.
|
||||||
|
|
||||||
|
### 5. Reboot Recovery Continuity
|
||||||
|
1. For reboot_host action:
|
||||||
|
- send execution_started
|
||||||
|
- trigger reboot promptly
|
||||||
|
2. During startup:
|
||||||
|
- emit heartbeat early
|
||||||
|
- emit process-health once service is ready
|
||||||
|
3. Keep last command execution state available after reboot for reconciliation.
|
||||||
|
|
||||||
|
### 6. Time And Timeout Semantics
|
||||||
|
1. Use monotonic timers for local elapsed-time checks.
|
||||||
|
2. Use UTC wall-clock only for protocol timestamps and expiry comparisons.
|
||||||
|
3. Target reconnect baseline on Pi 5 USB-SATA SSD: 90 seconds.
|
||||||
|
4. Accept cold-boot and USB enumeration ceiling up to 150 seconds.
|
||||||
|
|
||||||
|
### 7. Capability Reporting
|
||||||
|
1. Report recovery capability class:
|
||||||
|
- software_only
|
||||||
|
- managed_poe_available
|
||||||
|
- manual_only
|
||||||
|
2. Report watchdog enabled status.
|
||||||
|
3. Report boot-source metadata for diagnostics.
|
||||||
|
|
||||||
|
### 8. Error Codes Minimum Set
|
||||||
|
1. invalid_schema
|
||||||
|
2. missing_field
|
||||||
|
3. stale_command
|
||||||
|
4. duplicate_command
|
||||||
|
5. permission_denied_local
|
||||||
|
6. execution_timeout
|
||||||
|
7. execution_failed
|
||||||
|
8. broker_unavailable
|
||||||
|
9. internal_error
|
||||||
|
|
||||||
|
### Acceptance Tests (Client Team)
|
||||||
|
1. Invalid schema payload is rejected and failed ack emitted.
|
||||||
|
2. Expired command is rejected and not executed.
|
||||||
|
3. Duplicate command_id is not executed twice.
|
||||||
|
4. reboot_host emits execution_started and reconnects with heartbeat in expected window.
|
||||||
|
5. restart_service action completes without host reboot and emits completed.
|
||||||
|
6. MQTT outage during ack path retries correctly without duplicate execution.
|
||||||
|
7. Boot-loop protection cooperates with server-side lockout semantics.
|
||||||
|
|
||||||
|
### Delivery Artifacts
|
||||||
|
1. Client protocol conformance checklist.
|
||||||
|
2. Test evidence for all acceptance tests.
|
||||||
|
3. Runtime logs showing full lifecycle for one restart and one reboot scenario.
|
||||||
|
4. Known limitations list per image version.
|
||||||
|
|
||||||
|
### Definition Of Done
|
||||||
|
1. All acceptance tests pass on Pi 5 USB-SATA SSD test devices.
|
||||||
|
2. No duplicate execution observed under reconnect and retained-delivery edge cases.
|
||||||
|
3. Acknowledgement sequence is complete and machine-parseable for server correlation.
|
||||||
|
4. Reboot recovery continuity works without managed PoE dependencies.
|
||||||
214
implementation-plans/reboot-implementation-handoff-share.md
Normal file
214
implementation-plans/reboot-implementation-handoff-share.md
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
## Remote Reboot Reliability Handoff (Share Document)
|
||||||
|
|
||||||
|
### Purpose
|
||||||
|
This document defines the agreed implementation scope for reliable remote reboot and shutdown of Raspberry Pi 5 clients, with monitoring-first visibility and safe escalation paths.
|
||||||
|
|
||||||
|
### Scope
|
||||||
|
1. In scope: restart and shutdown command reliability.
|
||||||
|
2. In scope: full lifecycle monitoring and audit visibility.
|
||||||
|
3. In scope: capability-tier recovery model with optional managed PoE escalation.
|
||||||
|
4. Out of scope: broader maintenance module in client-management for this cycle.
|
||||||
|
5. Out of scope: mandatory dependency on customer-managed power switching.
|
||||||
|
|
||||||
|
### Agreed Operating Model
|
||||||
|
1. Command delivery is asynchronous and lifecycle-tracked, not fire-and-forget.
|
||||||
|
2. Commands use idempotent command_id semantics with stale-command rejection by expires_at.
|
||||||
|
3. Monitoring is authoritative for operational state and escalation decisions.
|
||||||
|
4. Recovery must function even when no managed power switching is available.
|
||||||
|
|
||||||
|
### Frozen Contract v1 (Effective Immediately)
|
||||||
|
1. Canonical command topic: infoscreen/{client_uuid}/commands.
|
||||||
|
2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack.
|
||||||
|
3. Transitional compatibility topics accepted during migration:
|
||||||
|
- infoscreen/{client_uuid}/command
|
||||||
|
- infoscreen/{client_uuid}/command/ack
|
||||||
|
4. QoS policy: command QoS 1, ack QoS 1 recommended.
|
||||||
|
5. Retain policy: commands and acks are non-retained.
|
||||||
|
|
||||||
|
Command payload schema (frozen):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"action": "reboot_host",
|
||||||
|
"issued_at": "2026-04-03T12:48:10Z",
|
||||||
|
"expires_at": "2026-04-03T12:52:10Z",
|
||||||
|
"requested_by": 1,
|
||||||
|
"reason": "operator_request"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Ack payload schema (frozen):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"command_id": "5d1f8b4b-7e85-44fb-8f38-3f5d5da5e2e4",
|
||||||
|
"status": "execution_started",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Allowed ack status values:
|
||||||
|
1. accepted
|
||||||
|
2. execution_started
|
||||||
|
3. completed
|
||||||
|
4. failed
|
||||||
|
|
||||||
|
Frozen command action values:
|
||||||
|
1. reboot_host
|
||||||
|
2. shutdown_host
|
||||||
|
|
||||||
|
API endpoint mapping:
|
||||||
|
1. POST /api/clients/{uuid}/restart -> action reboot_host
|
||||||
|
2. POST /api/clients/{uuid}/shutdown -> action shutdown_host
|
||||||
|
|
||||||
|
Validation snippets:
|
||||||
|
1. Human-readable snippets: implementation-plans/reboot-command-payload-schemas.md
|
||||||
|
2. Machine-validated JSON Schema: implementation-plans/reboot-command-payload-schemas.json
|
||||||
|
|
||||||
|
### Command Lifecycle States
|
||||||
|
1. queued
|
||||||
|
2. publish_in_progress
|
||||||
|
3. published
|
||||||
|
4. ack_received
|
||||||
|
5. execution_started
|
||||||
|
6. awaiting_reconnect
|
||||||
|
7. recovered
|
||||||
|
8. completed
|
||||||
|
9. failed
|
||||||
|
10. expired
|
||||||
|
11. timed_out
|
||||||
|
12. canceled
|
||||||
|
13. blocked_safety
|
||||||
|
14. manual_intervention_required
|
||||||
|
|
||||||
|
### Timeout Defaults (Pi 5, USB-SATA SSD baseline)
|
||||||
|
1. queued to publish_in_progress: immediate, timeout 5 seconds.
|
||||||
|
2. publish_in_progress to published: timeout 8 seconds.
|
||||||
|
3. published to ack_received: timeout 20 seconds.
|
||||||
|
4. ack_received to execution_started: 15 seconds for service restart, 25 seconds for host reboot.
|
||||||
|
5. execution_started to awaiting_reconnect: timeout 10 seconds.
|
||||||
|
6. awaiting_reconnect to recovered: baseline 90 seconds after validation, cold-boot ceiling 150 seconds.
|
||||||
|
7. recovered to completed: 15 to 20 seconds based on fleet stability.
|
||||||
|
8. command expires_at default: 240 seconds, bounded 180 to 360 seconds.
|
||||||
|
|
||||||
|
### Recovery Tiers
|
||||||
|
1. Tier 0 baseline, always required: watchdog, systemd auto-restart, lifecycle tracking, manual intervention fallback.
|
||||||
|
2. Tier 1 optional: managed PoE per-port power-cycle escalation where customer infrastructure supports it.
|
||||||
|
3. Tier 2 no remote power control: direct manual intervention workflow.
|
||||||
|
|
||||||
|
### Governance And Safety
|
||||||
|
1. Role access: admin and superadmin.
|
||||||
|
2. Bulk actions require reason capture.
|
||||||
|
3. Safety lockout: maximum 3 reboot commands per client in 15 minutes.
|
||||||
|
4. Escalation cooldown: 60 seconds before automatic move to manual_intervention_required.
|
||||||
|
|
||||||
|
### MQTT Auth Hardening (Phase 1, Required Before Broad Rollout)
|
||||||
|
1. Intranet-only deployment is not sufficient protection for privileged MQTT actions by itself.
|
||||||
|
2. Phase 1 hardening scope is broker authentication, authorization, and network restriction; payload URL allowlisting is deferred to a later client/server feature.
|
||||||
|
3. MQTT broker must disable anonymous publish/subscribe access in production.
|
||||||
|
4. MQTT broker must require authenticated identities for server-side publishers and client devices.
|
||||||
|
5. MQTT broker must enforce ACLs so that:
|
||||||
|
- only server-side services can publish to `infoscreen/{client_uuid}/commands`
|
||||||
|
- only server-side services can publish scheduler event topics
|
||||||
|
- each client can subscribe only to its own command topics and assigned event topics
|
||||||
|
- each client can publish only its own ack, heartbeat, health, dashboard, and telemetry topics
|
||||||
|
6. Broker port exposure must be restricted to the management network and approved hosts only.
|
||||||
|
7. TLS support is strongly recommended in this phase and should be enabled when operationally feasible.
|
||||||
|
|
||||||
|
### Server Team Actions For Auth Hardening
|
||||||
|
1. Provision broker credentials for command/event publishers and for client devices.
|
||||||
|
2. Configure Mosquitto or equivalent broker ACLs for per-topic publish and subscribe restrictions.
|
||||||
|
3. Disable anonymous access on production brokers.
|
||||||
|
4. Restrict broker network exposure with firewall rules, VLAN policy, or equivalent network controls.
|
||||||
|
5. Update server/frontend deployment to publish MQTT with authenticated credentials.
|
||||||
|
6. Validate that server-side event publishing and reboot/shutdown command publishing still work under the new ACL policy.
|
||||||
|
7. Coordinate credential distribution and rotation with the client deployment process.
|
||||||
|
|
||||||
|
### MQTT ACL Matrix (Canonical Baseline)
|
||||||
|
| Actor | Topic Pattern | Publish | Subscribe | Notes |
|
||||||
|
| --- | --- | --- | --- | --- |
|
||||||
|
| scheduler-service | infoscreen/events/+ | Yes | No | Publishes retained active event list per group. |
|
||||||
|
| api-command-publisher | infoscreen/+/commands | Yes | No | Publishes canonical reboot/shutdown commands. |
|
||||||
|
| api-command-publisher | infoscreen/+/command | Yes | No | Transitional compatibility publish only. |
|
||||||
|
| api-group-assignment | infoscreen/+/group_id | Yes | No | Publishes retained client-to-group assignment. |
|
||||||
|
| listener-service | infoscreen/+/commands/ack | No | Yes | Consumes canonical client command acknowledgements. |
|
||||||
|
| listener-service | infoscreen/+/command/ack | No | Yes | Consumes transitional compatibility acknowledgements. |
|
||||||
|
| listener-service | infoscreen/+/heartbeat | No | Yes | Consumes heartbeat telemetry. |
|
||||||
|
| listener-service | infoscreen/+/health | No | Yes | Consumes health telemetry. |
|
||||||
|
| listener-service | infoscreen/+/dashboard | No | Yes | Consumes dashboard screenshot payloads. |
|
||||||
|
| listener-service | infoscreen/+/screenshot | No | Yes | Consumes screenshot payloads (if enabled). |
|
||||||
|
| listener-service | infoscreen/+/logs/error | No | Yes | Consumes client error logs. |
|
||||||
|
| listener-service | infoscreen/+/logs/warn | No | Yes | Consumes client warn logs. |
|
||||||
|
| listener-service | infoscreen/+/logs/info | No | Yes | Consumes client info logs. |
|
||||||
|
| listener-service | infoscreen/discovery | No | Yes | Consumes discovery announcements. |
|
||||||
|
| listener-service | infoscreen/+/discovery_ack | Yes | No | Publishes discovery acknowledgements. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/commands | No | Yes | Canonical command intake for this client only. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/command | No | Yes | Transitional compatibility intake for this client only. |
|
||||||
|
| client-<uuid> | infoscreen/events/<group_id> | No | Yes | Assigned group event feed only; dynamic per assignment. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/commands/ack | Yes | No | Canonical command acknowledgements for this client only. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/command/ack | Yes | No | Transitional compatibility acknowledgements for this client only. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/heartbeat | Yes | No | Heartbeat telemetry. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/health | Yes | No | Health telemetry. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/dashboard | Yes | No | Dashboard status and screenshot payloads. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/screenshot | Yes | No | Screenshot payloads (if enabled). |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/logs/error | Yes | No | Error log stream. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/logs/warn | Yes | No | Warning log stream. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/logs/info | Yes | No | Info log stream. |
|
||||||
|
| client-<uuid> | infoscreen/discovery | Yes | No | Discovery announcement. |
|
||||||
|
| client-<uuid> | infoscreen/<uuid>/discovery_ack | No | Yes | Discovery acknowledgment from listener. |
|
||||||
|
|
||||||
|
ACL implementation notes:
|
||||||
|
1. Use per-client identities; client ACLs must be scoped to exact client UUID and must not allow wildcard access to other clients.
|
||||||
|
2. Event topic subscription (`infoscreen/events/<group_id>`) should be managed via broker-side ACL provisioning that updates when group assignment changes.
|
||||||
|
3. Transitional singular command topics are temporary and should be removed after migration cutover.
|
||||||
|
4. Deny by default: any topic not explicitly listed above should be blocked for each actor.
|
||||||
|
|
||||||
|
### Credential Management Guidance
|
||||||
|
1. Real MQTT passwords must not be stored in tracked documentation or committed templates.
|
||||||
|
2. Each client device should receive a unique broker username and password, stored only in its local [/.env](.env).
|
||||||
|
3. Server-side publisher credentials should be stored in the server team's secret-management path, not in source control.
|
||||||
|
4. Recommended naming convention for client broker users: `infoscreen-client-<client-uuid-prefix>`.
|
||||||
|
5. Client passwords should be random, at least 20 characters, and rotated through deployment tooling or broker administration procedures.
|
||||||
|
6. The server/infrastructure team owns broker-side user creation, ACL assignment, rotation, and revocation.
|
||||||
|
7. The client team owns loading credentials from local env files and validating connection behavior against the secured broker.
|
||||||
|
|
||||||
|
### Client Team Actions For Auth Hardening
|
||||||
|
1. Add MQTT username/password support in the client connection setup.
|
||||||
|
2. Add client-side TLS configuration support from environment when certificates are provided.
|
||||||
|
3. Update local test helpers to support authenticated MQTT publishing and subscription.
|
||||||
|
4. Validate command and event intake against the authenticated broker configuration before canary rollout.
|
||||||
|
|
||||||
|
### Ready For Server/Frontend Team (Auth Phase)
|
||||||
|
1. Client implementation is ready to connect with MQTT auth from local `.env` (`MQTT_USERNAME`, `MQTT_PASSWORD`, optional TLS settings).
|
||||||
|
2. Client command/event intake and client ack/telemetry publishing run over the authenticated MQTT session.
|
||||||
|
3. Server/frontend team must now complete broker-side enforcement and publisher migration.
|
||||||
|
|
||||||
|
Server/frontend done criteria:
|
||||||
|
1. Anonymous broker access is disabled in production.
|
||||||
|
2. Server-side publishers use authenticated broker credentials.
|
||||||
|
3. ACLs are active and validated for command, event, and client telemetry topics.
|
||||||
|
4. At least one canary client proves end-to-end flow under ACLs:
|
||||||
|
- server publishes command/event with authenticated publisher
|
||||||
|
- client receives payload
|
||||||
|
- client sends ack/telemetry successfully
|
||||||
|
5. Revocation test passes: disabling one client credential blocks only that client without impacting others.
|
||||||
|
|
||||||
|
Operational note:
|
||||||
|
1. Client-side auth support is necessary but not sufficient by itself; broker ACL/auth enforcement is the security control that must be enabled by the server/infrastructure team.
|
||||||
|
|
||||||
|
### Rollout Plan
|
||||||
|
1. Contract freeze and sign-off.
|
||||||
|
2. Platform and client implementation against frozen schemas.
|
||||||
|
3. One-group canary.
|
||||||
|
4. Rollback if failed plus timed_out exceeds 5 percent.
|
||||||
|
5. Expand only after 7 days below intervention threshold.
|
||||||
|
|
||||||
|
### Success Criteria
|
||||||
|
1. Deterministic command lifecycle visibility from enqueue to completion.
|
||||||
|
2. No duplicate execution under reconnect or delayed-delivery conditions.
|
||||||
|
3. Stable Pi 5 SSD reconnect behavior within defined baseline.
|
||||||
|
4. Clear and actionable manual intervention states when automatic recovery is exhausted.
|
||||||
54
implementation-plans/reboot-kickoff-summary.md
Normal file
54
implementation-plans/reboot-kickoff-summary.md
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
## Reboot Reliability Kickoff Summary
|
||||||
|
|
||||||
|
### Objective
|
||||||
|
Ship a reliable, observable restart and shutdown workflow for Raspberry Pi 5 clients, with safe escalation and clear operator outcomes.
|
||||||
|
|
||||||
|
### What Is Included
|
||||||
|
1. Asynchronous command lifecycle with idempotent command_id handling.
|
||||||
|
2. Monitoring-first state visibility from queued to terminal outcomes.
|
||||||
|
3. Client acknowledgements for accepted, execution_started, completed, and failed.
|
||||||
|
4. Pi 5 USB-SATA SSD timeout baseline and tuning rules.
|
||||||
|
5. Capability-tier recovery with optional managed PoE escalation.
|
||||||
|
|
||||||
|
### What Is Not Included
|
||||||
|
1. Full maintenance module in client-management.
|
||||||
|
2. Required managed power-switch integration.
|
||||||
|
3. Production Wake-on-LAN rollout.
|
||||||
|
|
||||||
|
### Team Split
|
||||||
|
1. Platform team: API command lifecycle, safety controls, listener ack ingestion.
|
||||||
|
2. Web team: lifecycle-aware UX and command status display.
|
||||||
|
3. Client team: strict validation, dedupe, ack sequence, secure execution helper, reboot continuity.
|
||||||
|
|
||||||
|
### Ownership Matrix
|
||||||
|
| Team | Primary Plan File | Main Deliverables |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| Platform team | implementation-plans/reboot-implementation-handoff-share.md | Command lifecycle backend, policy enforcement, listener ack mapping, safety lockout and escalation |
|
||||||
|
| Web team | implementation-plans/reboot-implementation-handoff-share.md | Lifecycle UI states, bulk safety UX, capability visibility, command status polling |
|
||||||
|
| Client team | implementation-plans/reboot-implementation-handoff-client-team.md | Command validation, dedupe persistence, ack sequence, secure execution helper, reboot continuity |
|
||||||
|
| Project coordination | implementation-plans/reboot-kickoff-summary.md | Phase sequencing, canary gates, rollback thresholds, cross-team sign-off tracking |
|
||||||
|
|
||||||
|
### Baseline Operational Defaults
|
||||||
|
1. Safety lockout: 3 reboot commands per client in rolling 15 minutes.
|
||||||
|
2. Escalation cooldown: 60 seconds.
|
||||||
|
3. Reconnect target on Pi 5 SSD: 90 seconds baseline, 150 seconds cold-boot ceiling.
|
||||||
|
4. Rollback canary trigger: failed plus timed_out above 5 percent.
|
||||||
|
|
||||||
|
### Frozen Contract Snapshot
|
||||||
|
1. Canonical command topic: infoscreen/{client_uuid}/commands.
|
||||||
|
2. Canonical ack topic: infoscreen/{client_uuid}/commands/ack.
|
||||||
|
3. Transitional compatibility topics during migration:
|
||||||
|
- infoscreen/{client_uuid}/command
|
||||||
|
- infoscreen/{client_uuid}/command/ack
|
||||||
|
4. Command schema version: 1.0.
|
||||||
|
5. Allowed command actions: reboot_host, shutdown_host.
|
||||||
|
6. Allowed ack status values: accepted, execution_started, completed, failed.
|
||||||
|
7. Validation snippets:
|
||||||
|
- implementation-plans/reboot-command-payload-schemas.md
|
||||||
|
- implementation-plans/reboot-command-payload-schemas.json
|
||||||
|
|
||||||
|
### Immediate Next Steps
|
||||||
|
1. Continue implementation in parallel by team against frozen contract.
|
||||||
|
2. Client team validates dedupe and expiry handling on canonical topics.
|
||||||
|
3. Platform team verifies ack-state transitions for accepted, execution_started, completed, failed.
|
||||||
|
4. Execute one-group canary and validate timing plus failure drills.
|
||||||
127
implementation-plans/server-team-actions.md
Normal file
127
implementation-plans/server-team-actions.md
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
# Server Team Action Items — Infoscreen Client
|
||||||
|
|
||||||
|
This document lists everything the server/infrastructure/frontend team must implement to complete the client integration. The client-side code is production-ready for all items listed here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. MQTT Broker Hardening (prerequisite for everything else)
|
||||||
|
|
||||||
|
- Disable anonymous access on the broker.
|
||||||
|
- Create one broker account **per client device**:
|
||||||
|
- Username convention: `infoscreen-client-<uuid-prefix>` (e.g. `infoscreen-client-9b8d1856`)
|
||||||
|
- Provision the password to the device `.env` as `MQTT_PASSWORD_BROKER=`
|
||||||
|
- Create a **server/publisher account** (e.g. `infoscreen-server`) for all server-side publishes.
|
||||||
|
- Enforce ACLs:
|
||||||
|
|
||||||
|
| Topic | Publisher |
|
||||||
|
|---|---|
|
||||||
|
| `infoscreen/{uuid}/commands` | server only |
|
||||||
|
| `infoscreen/{uuid}/command` (alias) | server only |
|
||||||
|
| `infoscreen/{uuid}/group_id` | server only |
|
||||||
|
| `infoscreen/events/{group_id}` | server only |
|
||||||
|
| `infoscreen/groups/+/power/intent` | server only |
|
||||||
|
| `infoscreen/{uuid}/commands/ack` | client only |
|
||||||
|
| `infoscreen/{uuid}/command/ack` | client only |
|
||||||
|
| `infoscreen/{uuid}/heartbeat` | client only |
|
||||||
|
| `infoscreen/{uuid}/health` | client only |
|
||||||
|
| `infoscreen/{uuid}/logs/#` | client only |
|
||||||
|
| `infoscreen/{uuid}/service_failed` | client only |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Reboot / Shutdown Command — Ack Lifecycle
|
||||||
|
|
||||||
|
Client publishes ack status updates to two topics per command (canonical + transitional alias):
|
||||||
|
- `infoscreen/{uuid}/commands/ack`
|
||||||
|
- `infoscreen/{uuid}/command/ack`
|
||||||
|
|
||||||
|
**Ack payload schema (v1, frozen):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"command_id": "07aab032-53c2-45ef-a5a3-6aa58e9d9fae",
|
||||||
|
"status": "accepted | execution_started | completed | failed",
|
||||||
|
"error_code": null,
|
||||||
|
"error_message": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Status lifecycle:**
|
||||||
|
|
||||||
|
| Status | When | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `accepted` | Command received and validated | Immediate |
|
||||||
|
| `execution_started` | Helper invoked | Immediate after accepted |
|
||||||
|
| `completed` | Execution confirmed | For `reboot_host`: arrives after reconnect (10–90 s after `execution_started`) |
|
||||||
|
| `failed` | Helper returned error | `error_code` and `error_message` will be set |
|
||||||
|
|
||||||
|
**Server must:**
|
||||||
|
- Track `command_id` through the full lifecycle and update status in DB/UI.
|
||||||
|
- Surface `failed` + `error_code` to the operator UI.
|
||||||
|
- Expect `reboot_host` `completed` to arrive after a reconnect delay — do not treat the gap as a timeout.
|
||||||
|
- Use `expires_at` from the original command to determine when to abandon waiting.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Health Dashboard — Broker Connection Fields (Gap 2)
|
||||||
|
|
||||||
|
Every `infoscreen/{uuid}/health` payload now includes a `broker_connection` block:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"timestamp": "2026-04-05T08:00:00.000000+00:00",
|
||||||
|
"expected_state": { "event_id": 42 },
|
||||||
|
"actual_state": {
|
||||||
|
"process": "display_manager",
|
||||||
|
"pid": 1234,
|
||||||
|
"status": "running"
|
||||||
|
},
|
||||||
|
"broker_connection": {
|
||||||
|
"broker_reachable": true,
|
||||||
|
"reconnect_count": 2,
|
||||||
|
"last_disconnect_at": "2026-04-04T10:30:00Z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Server must:**
|
||||||
|
- Display `reconnect_count` and `last_disconnect_at` per device in the health dashboard.
|
||||||
|
- Implement alerting heuristic:
|
||||||
|
- **All** clients go silent simultaneously → likely broker outage, not device crash.
|
||||||
|
- **Single** client goes silent → device crash, network failure, or process hang.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Service-Failed MQTT Notification (Gap 3)
|
||||||
|
|
||||||
|
When systemd gives up restarting a service after repeated crashes (`StartLimitBurst` exceeded), the client automatically publishes a **retained** message:
|
||||||
|
|
||||||
|
**Topic:** `infoscreen/{uuid}/service_failed`
|
||||||
|
|
||||||
|
**Payload:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"event": "service_failed",
|
||||||
|
"unit": "infoscreen-simclient.service",
|
||||||
|
"client_uuid": "9b8d1856-ff34-4864-a726-12de072d0f77",
|
||||||
|
"failed_at": "2026-04-05T08:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Server must:**
|
||||||
|
- Subscribe to `infoscreen/+/service_failed` on startup (retained — message survives broker restart).
|
||||||
|
- Alert the operator immediately when this topic receives a payload.
|
||||||
|
- **Clear the retained message** once the device is acknowledged or recovered:
|
||||||
|
```
|
||||||
|
mosquitto_pub -t "infoscreen/{uuid}/service_failed" -n --retain
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. No Server Action Required
|
||||||
|
|
||||||
|
These items are fully implemented client-side and require no server changes:
|
||||||
|
|
||||||
|
- systemd watchdog (`WatchdogSec=60`) — hangs detected and process restarted automatically.
|
||||||
|
- Command deduplication — `command_id` deduplicated with 24-hour TTL.
|
||||||
|
- Ack retry backoff — client retries ack publish on broker disconnect until `expires_at`.
|
||||||
|
- Mock helper / test mode (`COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE`) — development only.
|
||||||
@@ -4,11 +4,12 @@ import logging
|
|||||||
import datetime
|
import datetime
|
||||||
import base64
|
import base64
|
||||||
import re
|
import re
|
||||||
|
import ssl
|
||||||
import requests
|
import requests
|
||||||
import paho.mqtt.client as mqtt
|
import paho.mqtt.client as mqtt
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
from models.models import Client, ClientLog, LogLevel, ProcessStatus, ScreenHealthStatus
|
from models.models import Client, ClientLog, ClientCommand, LogLevel, ProcessStatus, ScreenHealthStatus
|
||||||
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s')
|
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||||
|
|
||||||
# Load .env only when not already configured by Docker (API_BASE_URL not set by compose means we're outside a container)
|
# Load .env only when not already configured by Docker (API_BASE_URL not set by compose means we're outside a container)
|
||||||
@@ -32,6 +33,16 @@ Session = sessionmaker(bind=engine)
|
|||||||
# API configuration
|
# API configuration
|
||||||
API_BASE_URL = os.getenv("API_BASE_URL", "http://server:8000")
|
API_BASE_URL = os.getenv("API_BASE_URL", "http://server:8000")
|
||||||
|
|
||||||
|
MQTT_BROKER_HOST = os.getenv("MQTT_BROKER_HOST", "mqtt")
|
||||||
|
MQTT_BROKER_PORT = int(os.getenv("MQTT_BROKER_PORT", os.getenv("MQTT_PORT", "1883")))
|
||||||
|
MQTT_USERNAME = os.getenv("MQTT_USER") or os.getenv("MQTT_USERNAME")
|
||||||
|
MQTT_PASSWORD = os.getenv("MQTT_PASSWORD")
|
||||||
|
MQTT_TLS_ENABLED = os.getenv("MQTT_TLS_ENABLED", "false").strip().lower() in ("1", "true", "yes", "on")
|
||||||
|
MQTT_TLS_CA_CERT = os.getenv("MQTT_TLS_CA_CERT")
|
||||||
|
MQTT_TLS_CERTFILE = os.getenv("MQTT_TLS_CERTFILE")
|
||||||
|
MQTT_TLS_KEYFILE = os.getenv("MQTT_TLS_KEYFILE")
|
||||||
|
MQTT_TLS_INSECURE = os.getenv("MQTT_TLS_INSECURE", "false").strip().lower() in ("1", "true", "yes", "on")
|
||||||
|
|
||||||
# Dashboard payload migration observability
|
# Dashboard payload migration observability
|
||||||
DASHBOARD_METRICS_LOG_EVERY = int(os.getenv("DASHBOARD_METRICS_LOG_EVERY", "5"))
|
DASHBOARD_METRICS_LOG_EVERY = int(os.getenv("DASHBOARD_METRICS_LOG_EVERY", "5"))
|
||||||
DASHBOARD_PARSE_METRICS = {
|
DASHBOARD_PARSE_METRICS = {
|
||||||
@@ -376,8 +387,11 @@ def on_connect(client, userdata, flags, reasonCode, properties):
|
|||||||
client.subscribe("infoscreen/+/logs/warn")
|
client.subscribe("infoscreen/+/logs/warn")
|
||||||
client.subscribe("infoscreen/+/logs/info")
|
client.subscribe("infoscreen/+/logs/info")
|
||||||
client.subscribe("infoscreen/+/health")
|
client.subscribe("infoscreen/+/health")
|
||||||
|
client.subscribe("infoscreen/+/commands/ack")
|
||||||
|
client.subscribe("infoscreen/+/command/ack")
|
||||||
|
client.subscribe("infoscreen/+/service_failed")
|
||||||
|
|
||||||
logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, and health")
|
logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, health, and service_failed")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Subscribe failed on connect: {e}")
|
logging.error(f"Subscribe failed on connect: {e}")
|
||||||
|
|
||||||
@@ -387,6 +401,72 @@ def on_message(client, userdata, msg):
|
|||||||
logging.debug(f"Empfangene Nachricht auf Topic: {topic}")
|
logging.debug(f"Empfangene Nachricht auf Topic: {topic}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Command acknowledgement handling
|
||||||
|
if topic.startswith("infoscreen/") and (topic.endswith("/commands/ack") or topic.endswith("/command/ack")):
|
||||||
|
uuid = topic.split("/")[1]
|
||||||
|
try:
|
||||||
|
payload = json.loads(msg.payload.decode())
|
||||||
|
except (json.JSONDecodeError, UnicodeDecodeError):
|
||||||
|
logging.error(f"Ungueltiges Command-ACK Payload von {uuid}")
|
||||||
|
return
|
||||||
|
|
||||||
|
command_id = payload.get("command_id")
|
||||||
|
ack_status = str(payload.get("status", "")).strip().lower()
|
||||||
|
error_code = payload.get("error_code")
|
||||||
|
error_message = payload.get("error_message")
|
||||||
|
|
||||||
|
if not command_id:
|
||||||
|
logging.warning(f"Command-ACK ohne command_id von {uuid}")
|
||||||
|
return
|
||||||
|
|
||||||
|
status_map = {
|
||||||
|
"accepted": "ack_received",
|
||||||
|
"execution_started": "execution_started",
|
||||||
|
"completed": "completed",
|
||||||
|
"failed": "failed",
|
||||||
|
}
|
||||||
|
mapped_status = status_map.get(ack_status)
|
||||||
|
if not mapped_status:
|
||||||
|
logging.warning(f"Unbekannter Command-ACK Status '{ack_status}' von {uuid}")
|
||||||
|
return
|
||||||
|
|
||||||
|
db_session = Session()
|
||||||
|
try:
|
||||||
|
command_obj = db_session.query(ClientCommand).filter_by(command_id=command_id).first()
|
||||||
|
if not command_obj:
|
||||||
|
logging.warning(f"Command-ACK fuer unbekanntes command_id={command_id} von {uuid}")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Ignore stale/duplicate regressions.
|
||||||
|
terminal_states = {"completed", "failed", "expired", "canceled", "blocked_safety"}
|
||||||
|
if command_obj.status in terminal_states:
|
||||||
|
logging.info(
|
||||||
|
f"Command-ACK ignoriert (bereits terminal): command_id={command_id}, status={command_obj.status}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
now_utc = datetime.datetime.now(datetime.UTC)
|
||||||
|
command_obj.status = mapped_status
|
||||||
|
if mapped_status == "ack_received":
|
||||||
|
command_obj.acked_at = now_utc
|
||||||
|
elif mapped_status == "execution_started":
|
||||||
|
command_obj.execution_started_at = now_utc
|
||||||
|
elif mapped_status == "completed":
|
||||||
|
command_obj.completed_at = now_utc
|
||||||
|
elif mapped_status == "failed":
|
||||||
|
command_obj.failed_at = now_utc
|
||||||
|
command_obj.error_code = str(error_code) if error_code is not None else command_obj.error_code
|
||||||
|
command_obj.error_message = str(error_message) if error_message is not None else command_obj.error_message
|
||||||
|
|
||||||
|
db_session.commit()
|
||||||
|
logging.info(f"Command-ACK verarbeitet: command_id={command_id}, status={mapped_status}, uuid={uuid}")
|
||||||
|
except Exception as e:
|
||||||
|
db_session.rollback()
|
||||||
|
logging.error(f"Fehler bei Command-ACK Verarbeitung ({command_id}): {e}")
|
||||||
|
finally:
|
||||||
|
db_session.close()
|
||||||
|
return
|
||||||
|
|
||||||
# Dashboard-Handling (nested screenshot payload)
|
# Dashboard-Handling (nested screenshot payload)
|
||||||
if topic.startswith("infoscreen/") and topic.endswith("/dashboard"):
|
if topic.startswith("infoscreen/") and topic.endswith("/dashboard"):
|
||||||
uuid = topic.split("/")[1]
|
uuid = topic.split("/")[1]
|
||||||
@@ -506,6 +586,43 @@ def on_message(client, userdata, msg):
|
|||||||
logging.error(f"Could not parse log payload from {uuid}: {e}")
|
logging.error(f"Could not parse log payload from {uuid}: {e}")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Service-failed handling (systemd gave up restarting — retained message)
|
||||||
|
if topic.startswith("infoscreen/") and topic.endswith("/service_failed"):
|
||||||
|
uuid = topic.split("/")[1]
|
||||||
|
# Empty payload = retained message cleared; ignore it.
|
||||||
|
if not msg.payload:
|
||||||
|
logging.info(f"service_failed retained message cleared for {uuid}")
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
payload_data = json.loads(msg.payload.decode())
|
||||||
|
failed_at_str = payload_data.get("failed_at")
|
||||||
|
unit = payload_data.get("unit", "")
|
||||||
|
try:
|
||||||
|
failed_at = datetime.datetime.fromisoformat(failed_at_str.replace("Z", "+00:00")) if failed_at_str else datetime.datetime.now(datetime.UTC)
|
||||||
|
if failed_at.tzinfo is None:
|
||||||
|
failed_at = failed_at.replace(tzinfo=datetime.UTC)
|
||||||
|
except (ValueError, AttributeError):
|
||||||
|
failed_at = datetime.datetime.now(datetime.UTC)
|
||||||
|
|
||||||
|
session = Session()
|
||||||
|
try:
|
||||||
|
client_obj = session.query(Client).filter_by(uuid=uuid).first()
|
||||||
|
if client_obj:
|
||||||
|
client_obj.service_failed_at = failed_at
|
||||||
|
client_obj.service_failed_unit = unit[:128] if unit else None
|
||||||
|
session.commit()
|
||||||
|
logging.warning(f"event=service_failed uuid={uuid} unit={unit} failed_at={failed_at.isoformat()}")
|
||||||
|
else:
|
||||||
|
logging.warning(f"service_failed received for unknown client uuid={uuid}")
|
||||||
|
except Exception as e:
|
||||||
|
session.rollback()
|
||||||
|
logging.error(f"Error persisting service_failed for {uuid}: {e}")
|
||||||
|
finally:
|
||||||
|
session.close()
|
||||||
|
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
||||||
|
logging.error(f"Could not parse service_failed payload from {uuid}: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
# Health-Handling
|
# Health-Handling
|
||||||
if topic.startswith("infoscreen/") and topic.endswith("/health"):
|
if topic.startswith("infoscreen/") and topic.endswith("/health"):
|
||||||
uuid = topic.split("/")[1]
|
uuid = topic.split("/")[1]
|
||||||
@@ -531,6 +648,26 @@ def on_message(client, userdata, msg):
|
|||||||
screen_health_status=screen_health_status,
|
screen_health_status=screen_health_status,
|
||||||
last_screenshot_analyzed=parse_timestamp((payload_data.get('health_metrics') or {}).get('last_frame_update')),
|
last_screenshot_analyzed=parse_timestamp((payload_data.get('health_metrics') or {}).get('last_frame_update')),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Update broker connection health fields
|
||||||
|
broker_conn = payload_data.get('broker_connection')
|
||||||
|
if isinstance(broker_conn, dict):
|
||||||
|
reconnect_count = broker_conn.get('reconnect_count')
|
||||||
|
last_disconnect_str = broker_conn.get('last_disconnect_at')
|
||||||
|
if reconnect_count is not None:
|
||||||
|
try:
|
||||||
|
client_obj.mqtt_reconnect_count = int(reconnect_count)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
if last_disconnect_str:
|
||||||
|
try:
|
||||||
|
last_disconnect = datetime.datetime.fromisoformat(last_disconnect_str.replace('Z', '+00:00'))
|
||||||
|
if last_disconnect.tzinfo is None:
|
||||||
|
last_disconnect = last_disconnect.replace(tzinfo=datetime.UTC)
|
||||||
|
client_obj.mqtt_last_disconnect_at = last_disconnect
|
||||||
|
except (ValueError, AttributeError):
|
||||||
|
pass
|
||||||
|
|
||||||
session.commit()
|
session.commit()
|
||||||
logging.debug(f"Health update from {uuid}: {actual.get('process')} ({actual.get('status')})")
|
logging.debug(f"Health update from {uuid}: {actual.get('process')} ({actual.get('status')})")
|
||||||
session.close()
|
session.close()
|
||||||
@@ -589,9 +726,29 @@ def main():
|
|||||||
mqtt_client.on_connect = on_connect
|
mqtt_client.on_connect = on_connect
|
||||||
# Set an exponential reconnect delay to survive broker restarts
|
# Set an exponential reconnect delay to survive broker restarts
|
||||||
mqtt_client.reconnect_delay_set(min_delay=1, max_delay=60)
|
mqtt_client.reconnect_delay_set(min_delay=1, max_delay=60)
|
||||||
mqtt_client.connect("mqtt", 1883)
|
|
||||||
|
|
||||||
logging.info("Listener gestartet; warte auf MQTT-Verbindung und Nachrichten")
|
if MQTT_USERNAME and MQTT_PASSWORD:
|
||||||
|
mqtt_client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD)
|
||||||
|
|
||||||
|
if MQTT_TLS_ENABLED:
|
||||||
|
mqtt_client.tls_set(
|
||||||
|
ca_certs=MQTT_TLS_CA_CERT,
|
||||||
|
certfile=MQTT_TLS_CERTFILE,
|
||||||
|
keyfile=MQTT_TLS_KEYFILE,
|
||||||
|
cert_reqs=ssl.CERT_REQUIRED,
|
||||||
|
)
|
||||||
|
if MQTT_TLS_INSECURE:
|
||||||
|
mqtt_client.tls_insecure_set(True)
|
||||||
|
|
||||||
|
mqtt_client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT)
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
"Listener gestartet; warte auf MQTT-Verbindung und Nachrichten (host=%s port=%s tls=%s auth=%s)",
|
||||||
|
MQTT_BROKER_HOST,
|
||||||
|
MQTT_BROKER_PORT,
|
||||||
|
MQTT_TLS_ENABLED,
|
||||||
|
bool(MQTT_USERNAME and MQTT_PASSWORD),
|
||||||
|
)
|
||||||
mqtt_client.loop_forever()
|
mqtt_client.loop_forever()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -147,6 +147,14 @@ class Client(Base):
|
|||||||
screen_health_status = Column(Enum(ScreenHealthStatus), nullable=True, server_default='UNKNOWN')
|
screen_health_status = Column(Enum(ScreenHealthStatus), nullable=True, server_default='UNKNOWN')
|
||||||
last_screenshot_hash = Column(String(32), nullable=True)
|
last_screenshot_hash = Column(String(32), nullable=True)
|
||||||
|
|
||||||
|
# Systemd service-failed tracking
|
||||||
|
service_failed_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||||
|
service_failed_unit = Column(String(128), nullable=True)
|
||||||
|
|
||||||
|
# MQTT broker connection health
|
||||||
|
mqtt_reconnect_count = Column(Integer, nullable=True)
|
||||||
|
mqtt_last_disconnect_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||||
|
|
||||||
|
|
||||||
class ClientLog(Base):
|
class ClientLog(Base):
|
||||||
__tablename__ = 'client_logs'
|
__tablename__ = 'client_logs'
|
||||||
@@ -164,6 +172,33 @@ class ClientLog(Base):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ClientCommand(Base):
|
||||||
|
__tablename__ = 'client_commands'
|
||||||
|
|
||||||
|
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
command_id = Column(String(36), nullable=False, unique=True, index=True)
|
||||||
|
client_uuid = Column(String(36), ForeignKey('clients.uuid', ondelete='CASCADE'), nullable=False, index=True)
|
||||||
|
action = Column(String(32), nullable=False, index=True)
|
||||||
|
status = Column(String(40), nullable=False, index=True)
|
||||||
|
reason = Column(Text, nullable=True)
|
||||||
|
requested_by = Column(Integer, ForeignKey('users.id', ondelete='SET NULL'), nullable=True, index=True)
|
||||||
|
issued_at = Column(TIMESTAMP(timezone=True), nullable=False)
|
||||||
|
expires_at = Column(TIMESTAMP(timezone=True), nullable=False)
|
||||||
|
published_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||||
|
acked_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||||
|
execution_started_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||||
|
completed_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||||
|
failed_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||||
|
error_code = Column(String(64), nullable=True)
|
||||||
|
error_message = Column(Text, nullable=True)
|
||||||
|
created_at = Column(TIMESTAMP(timezone=True), server_default=func.current_timestamp(), nullable=False)
|
||||||
|
updated_at = Column(TIMESTAMP(timezone=True), server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=False)
|
||||||
|
|
||||||
|
__table_args__ = (
|
||||||
|
Index('ix_client_commands_client_status_created', 'client_uuid', 'status', 'created_at'),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class EventType(enum.Enum):
|
class EventType(enum.Enum):
|
||||||
presentation = "presentation"
|
presentation = "presentation"
|
||||||
website = "website"
|
website = "website"
|
||||||
|
|||||||
@@ -1,13 +1,14 @@
|
|||||||
# scheduler/db_utils.py
|
# scheduler/db_utils.py
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime, timedelta, timezone
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from sqlalchemy.orm import sessionmaker, joinedload
|
from sqlalchemy.orm import sessionmaker, joinedload
|
||||||
from sqlalchemy import create_engine, or_, and_, text
|
from sqlalchemy import create_engine, or_, and_, text
|
||||||
from models.models import Event, EventMedia, EventException, SystemSetting
|
import uuid as _uuid_mod
|
||||||
|
from models.models import Event, EventMedia, EventException, SystemSetting, Client, ClientCommand, ProcessStatus
|
||||||
from dateutil.rrule import rrulestr
|
from dateutil.rrule import rrulestr
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
@@ -454,3 +455,167 @@ def format_event_with_media(event):
|
|||||||
# Add other event types (message, etc.) here as needed...
|
# Add other event types (message, etc.) here as needed...
|
||||||
|
|
||||||
return event_dict
|
return event_dict
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Crash detection / auto-recovery helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_CRASH_RECOVERY_SCHEMA_VERSION = "1.0"
|
||||||
|
_CRASH_COMMAND_TOPIC = "infoscreen/{uuid}/commands"
|
||||||
|
_CRASH_COMMAND_COMPAT_TOPIC = "infoscreen/{uuid}/command"
|
||||||
|
_CRASH_RECOVERY_EXPIRY_SECONDS = int(os.getenv("CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS", "240"))
|
||||||
|
_CRASH_RECOVERY_LOCKOUT_MINUTES = int(os.getenv("CRASH_RECOVERY_LOCKOUT_MINUTES", "15"))
|
||||||
|
|
||||||
|
|
||||||
|
def get_crash_recovery_candidates(heartbeat_grace_seconds: int) -> list:
    """Return active clients eligible for automatic crash recovery.

    A client is a candidate when it is active and either its process status is
    ``crashed`` or its last heartbeat (``last_alive``) is older than
    ``heartbeat_grace_seconds``.  Clients that already received a
    ``reboot_host``/``restart_app`` command within the lockout window are
    skipped to avoid reboot loops.

    Args:
        heartbeat_grace_seconds: Heartbeat staleness threshold in seconds.

    Returns:
        list[dict]: One dict per candidate with ``uuid``, ``reason``
        (``process_crashed`` or ``heartbeat_stale``), ``process_status`` and
        ``last_alive``.

    Note: clients whose ``last_alive`` is NULL only qualify via the crashed
    process status (a SQL NULL comparison is never true) — kept on purpose.
    """
    session = Session()
    try:
        now = datetime.now(timezone.utc)
        stale_cutoff = now - timedelta(seconds=heartbeat_grace_seconds)
        lockout_cutoff = now - timedelta(minutes=_CRASH_RECOVERY_LOCKOUT_MINUTES)

        candidates = (
            session.query(Client)
            .filter(Client.is_active == True)  # noqa: E712 — SQLAlchemy expression, not a bool test
            .filter(
                or_(
                    Client.process_status == ProcessStatus.crashed,
                    Client.last_alive < stale_cutoff,
                )
            )
            .all()
        )
        if not candidates:
            return []

        # One batched query instead of a per-candidate lookup (avoids N+1):
        # collect the UUIDs of all candidates that already have a recent
        # recovery command inside the lockout window.
        locked_uuids = {
            row[0]
            for row in (
                session.query(ClientCommand.client_uuid)
                .filter(ClientCommand.client_uuid.in_([c.uuid for c in candidates]))
                .filter(ClientCommand.created_at >= lockout_cutoff)
                .filter(ClientCommand.action.in_(["reboot_host", "restart_app"]))
                .distinct()
                .all()
            )
        }

        result = []
        for c in candidates:
            if c.uuid in locked_uuids:
                continue
            crash_reason = (
                "process_crashed"
                if c.process_status == ProcessStatus.crashed
                else "heartbeat_stale"
            )
            result.append({
                "uuid": c.uuid,
                "reason": crash_reason,
                "process_status": c.process_status.value if c.process_status else None,
                "last_alive": c.last_alive,
            })
        return result
    finally:
        session.close()
|
||||||
|
|
||||||
|
|
||||||
|
def issue_crash_recovery_command(client_uuid: str, reason: str) -> tuple:
    """Persist a crash-recovery ``reboot_host`` command for *client_uuid*.

    The row is committed in status ``queued`` first and then flipped to
    ``publish_in_progress`` in a second commit, so a crash between the two
    steps still leaves an auditable row behind.

    Args:
        client_uuid: UUID of the client to recover.
        reason: Why recovery was triggered (e.g. ``process_crashed``).

    Returns:
        tuple: ``(command_id, payload, topic, compat_topic)`` — the caller is
        responsible for publishing *payload* to both MQTT topics and then
        recording the outcome via :func:`finalize_crash_recovery_command`.

    Raises:
        Exception: Re-raised after rollback on any DB failure.
    """
    session = Session()
    try:
        now = datetime.now(timezone.utc)
        expires_at = now + timedelta(seconds=_CRASH_RECOVERY_EXPIRY_SECONDS)
        command_id = str(_uuid_mod.uuid4())

        command = ClientCommand(
            command_id=command_id,
            client_uuid=client_uuid,
            action="reboot_host",
            status="queued",
            reason=reason,
            requested_by=None,  # automated recovery — no human requester
            issued_at=now,
            expires_at=expires_at,
        )
        session.add(command)
        session.commit()
        # Second commit: record that an MQTT publish attempt is about to start.
        command.status = "publish_in_progress"
        session.commit()

        payload = {
            "schema_version": _CRASH_RECOVERY_SCHEMA_VERSION,
            "command_id": command_id,
            "client_uuid": client_uuid,
            "action": "reboot_host",
            # ISO-8601 timestamps with an explicit "Z" suffix instead of "+00:00".
            "issued_at": now.isoformat().replace("+00:00", "Z"),
            "expires_at": expires_at.isoformat().replace("+00:00", "Z"),
            "requested_by": None,
            "reason": reason,
        }
        topic = _CRASH_COMMAND_TOPIC.format(uuid=client_uuid)
        compat_topic = _CRASH_COMMAND_COMPAT_TOPIC.format(uuid=client_uuid)
        return command_id, payload, topic, compat_topic
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||||
|
|
||||||
|
|
||||||
|
def finalize_crash_recovery_command(command_id: str, published: bool, error: str = None) -> None:
    """Record the outcome of the MQTT publish attempt for a recovery command.

    Sets the command to ``published`` (with ``published_at``) on success, or to
    ``failed`` (with ``failed_at`` and error details) otherwise.  Unknown
    command ids are ignored silently.

    Args:
        command_id: The command's external UUID.
        published: Whether the MQTT publish succeeded.
        error: Optional human-readable publish error for the failure path.
    """
    session = Session()
    try:
        cmd = session.query(ClientCommand).filter_by(command_id=command_id).first()
        if not cmd:
            return
        now = datetime.now(timezone.utc)
        if published:
            cmd.status = "published"
            cmd.published_at = now
        else:
            cmd.status = "failed"
            cmd.failed_at = now
            cmd.error_code = "mqtt_publish_failed"
            cmd.error_message = error or "Unknown publish error"
        session.commit()
    except Exception:
        # The original committed inside a bare try: a failed commit would leave
        # the transaction open.  Roll back and surface the error to the caller.
        session.rollback()
        raise
    finally:
        session.close()
|
||||||
|
|
||||||
|
|
||||||
|
_TERMINAL_COMMAND_STATUSES = {"completed", "failed", "expired", "canceled", "blocked_safety"}
|
||||||
|
|
||||||
|
|
||||||
|
def sweep_expired_commands() -> int:
    """Mark every non-terminal command whose expiry has passed as expired.

    Returns:
        int: Number of commands transitioned to the ``expired`` status.
    """
    session = Session()
    try:
        now = datetime.now(timezone.utc)
        stale = (
            session.query(ClientCommand)
            .filter(
                ClientCommand.expires_at < now,
                ClientCommand.status.notin_(_TERMINAL_COMMAND_STATUSES),
            )
            .all()
        )
        expired_total = len(stale)
        if expired_total == 0:
            return 0

        # Flip each stale command into a terminal state so the lifecycle
        # never leaves commands dangling in "queued"/"published" forever.
        for stale_cmd in stale:
            stale_cmd.status = "expired"
            stale_cmd.failed_at = now
            stale_cmd.error_code = "expired"
            stale_cmd.error_message = "Command expired before terminal state was reached."
        session.commit()
        return expired_total
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||||
|
|||||||
@@ -8,12 +8,28 @@ from .db_utils import (
|
|||||||
compute_group_power_intent_basis,
|
compute_group_power_intent_basis,
|
||||||
build_group_power_intent_body,
|
build_group_power_intent_body,
|
||||||
compute_group_power_intent_fingerprint,
|
compute_group_power_intent_fingerprint,
|
||||||
|
get_crash_recovery_candidates,
|
||||||
|
issue_crash_recovery_command,
|
||||||
|
finalize_crash_recovery_command,
|
||||||
|
sweep_expired_commands,
|
||||||
)
|
)
|
||||||
import paho.mqtt.client as mqtt
|
import paho.mqtt.client as mqtt
|
||||||
import json
|
import json
|
||||||
import datetime
|
import datetime
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
import ssl
|
||||||
|
|
||||||
|
|
||||||
|
# Broker endpoint; legacy env var names are honored as fallbacks.
MQTT_BROKER_HOST = os.getenv("MQTT_BROKER_HOST", os.getenv("MQTT_BROKER_URL", "mqtt"))
MQTT_BROKER_PORT = int(os.getenv("MQTT_BROKER_PORT", os.getenv("MQTT_PORT", "1883")))
# Credentials: MQTT_USER takes precedence over MQTT_USERNAME.
MQTT_USERNAME = os.getenv("MQTT_USER") or os.getenv("MQTT_USERNAME")
MQTT_PASSWORD = os.getenv("MQTT_PASSWORD")
# Optional TLS settings (disabled by default).
MQTT_TLS_ENABLED = os.getenv("MQTT_TLS_ENABLED", "false").strip().lower() in ("1", "true", "yes", "on")
MQTT_TLS_CA_CERT = os.getenv("MQTT_TLS_CA_CERT")
MQTT_TLS_CERTFILE = os.getenv("MQTT_TLS_CERTFILE")
MQTT_TLS_KEYFILE = os.getenv("MQTT_TLS_KEYFILE")
# Disables certificate hostname verification — for self-signed test setups only.
MQTT_TLS_INSECURE = os.getenv("MQTT_TLS_INSECURE", "false").strip().lower() in ("1", "true", "yes", "on")
|
||||||
|
|
||||||
|
|
||||||
def _to_utc_z(dt: datetime.datetime) -> str:
|
def _to_utc_z(dt: datetime.datetime) -> str:
|
||||||
@@ -224,6 +240,19 @@ def main():
|
|||||||
client = mqtt.Client(callback_api_version=mqtt.CallbackAPIVersion.VERSION2)
|
client = mqtt.Client(callback_api_version=mqtt.CallbackAPIVersion.VERSION2)
|
||||||
client.reconnect_delay_set(min_delay=1, max_delay=30)
|
client.reconnect_delay_set(min_delay=1, max_delay=30)
|
||||||
|
|
||||||
|
if MQTT_USERNAME and MQTT_PASSWORD:
|
||||||
|
client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD)
|
||||||
|
|
||||||
|
if MQTT_TLS_ENABLED:
|
||||||
|
client.tls_set(
|
||||||
|
ca_certs=MQTT_TLS_CA_CERT,
|
||||||
|
certfile=MQTT_TLS_CERTFILE,
|
||||||
|
keyfile=MQTT_TLS_KEYFILE,
|
||||||
|
cert_reqs=ssl.CERT_REQUIRED,
|
||||||
|
)
|
||||||
|
if MQTT_TLS_INSECURE:
|
||||||
|
client.tls_insecure_set(True)
|
||||||
|
|
||||||
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL_SECONDS", "30"))
|
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL_SECONDS", "30"))
|
||||||
# 0 = aus; z.B. 600 für alle 10 Min
|
# 0 = aus; z.B. 600 für alle 10 Min
|
||||||
# initial value from DB or fallback to env
|
# initial value from DB or fallback to env
|
||||||
@@ -238,16 +267,21 @@ def main():
|
|||||||
POWER_INTENT_HEARTBEAT_ENABLED = _env_bool("POWER_INTENT_HEARTBEAT_ENABLED", True)
|
POWER_INTENT_HEARTBEAT_ENABLED = _env_bool("POWER_INTENT_HEARTBEAT_ENABLED", True)
|
||||||
POWER_INTENT_EXPIRY_MULTIPLIER = int(os.getenv("POWER_INTENT_EXPIRY_MULTIPLIER", "3"))
|
POWER_INTENT_EXPIRY_MULTIPLIER = int(os.getenv("POWER_INTENT_EXPIRY_MULTIPLIER", "3"))
|
||||||
POWER_INTENT_MIN_EXPIRY_SECONDS = int(os.getenv("POWER_INTENT_MIN_EXPIRY_SECONDS", "90"))
|
POWER_INTENT_MIN_EXPIRY_SECONDS = int(os.getenv("POWER_INTENT_MIN_EXPIRY_SECONDS", "90"))
|
||||||
|
CRASH_RECOVERY_ENABLED = _env_bool("CRASH_RECOVERY_ENABLED", False)
|
||||||
|
CRASH_RECOVERY_GRACE_SECONDS = int(os.getenv("CRASH_RECOVERY_GRACE_SECONDS", "180"))
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
"Scheduler config: poll_interval=%ss refresh_seconds=%s power_intent_enabled=%s "
|
"Scheduler config: poll_interval=%ss refresh_seconds=%s power_intent_enabled=%s "
|
||||||
"power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss",
|
"power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss "
|
||||||
|
"crash_recovery_enabled=%s crash_recovery_grace=%ss",
|
||||||
POLL_INTERVAL,
|
POLL_INTERVAL,
|
||||||
REFRESH_SECONDS,
|
REFRESH_SECONDS,
|
||||||
POWER_INTENT_PUBLISH_ENABLED,
|
POWER_INTENT_PUBLISH_ENABLED,
|
||||||
POWER_INTENT_HEARTBEAT_ENABLED,
|
POWER_INTENT_HEARTBEAT_ENABLED,
|
||||||
POWER_INTENT_EXPIRY_MULTIPLIER,
|
POWER_INTENT_EXPIRY_MULTIPLIER,
|
||||||
POWER_INTENT_MIN_EXPIRY_SECONDS,
|
POWER_INTENT_MIN_EXPIRY_SECONDS,
|
||||||
|
CRASH_RECOVERY_ENABLED,
|
||||||
|
CRASH_RECOVERY_GRACE_SECONDS,
|
||||||
)
|
)
|
||||||
# Konfigurierbares Zeitfenster in Tagen (Standard: 7)
|
# Konfigurierbares Zeitfenster in Tagen (Standard: 7)
|
||||||
WINDOW_DAYS = int(os.getenv("EVENTS_WINDOW_DAYS", "7"))
|
WINDOW_DAYS = int(os.getenv("EVENTS_WINDOW_DAYS", "7"))
|
||||||
@@ -275,8 +309,15 @@ def main():
|
|||||||
|
|
||||||
client.on_connect = on_connect
|
client.on_connect = on_connect
|
||||||
|
|
||||||
client.connect("mqtt", 1883)
|
client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT)
|
||||||
client.loop_start()
|
client.loop_start()
|
||||||
|
logging.info(
|
||||||
|
"MQTT connection configured host=%s port=%s tls=%s auth=%s",
|
||||||
|
MQTT_BROKER_HOST,
|
||||||
|
MQTT_BROKER_PORT,
|
||||||
|
MQTT_TLS_ENABLED,
|
||||||
|
bool(MQTT_USERNAME and MQTT_PASSWORD),
|
||||||
|
)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
now = datetime.datetime.now(datetime.timezone.utc)
|
now = datetime.datetime.now(datetime.timezone.utc)
|
||||||
@@ -390,6 +431,51 @@ def main():
|
|||||||
power_intent_metrics["retained_republish_total"],
|
power_intent_metrics["retained_republish_total"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if CRASH_RECOVERY_ENABLED:
|
||||||
|
try:
|
||||||
|
candidates = get_crash_recovery_candidates(CRASH_RECOVERY_GRACE_SECONDS)
|
||||||
|
if candidates:
|
||||||
|
logging.info("event=crash_recovery_scan candidates=%s", len(candidates))
|
||||||
|
for candidate in candidates:
|
||||||
|
cuuid = candidate["uuid"]
|
||||||
|
reason = candidate["reason"]
|
||||||
|
try:
|
||||||
|
command_id, payload, topic, compat_topic = issue_crash_recovery_command(
|
||||||
|
client_uuid=cuuid,
|
||||||
|
reason=reason,
|
||||||
|
)
|
||||||
|
result = client.publish(topic, json.dumps(payload), qos=1, retain=False)
|
||||||
|
result.wait_for_publish(timeout=5.0)
|
||||||
|
compat_result = client.publish(compat_topic, json.dumps(payload), qos=1, retain=False)
|
||||||
|
compat_result.wait_for_publish(timeout=5.0)
|
||||||
|
success = result.rc == mqtt.MQTT_ERR_SUCCESS
|
||||||
|
error = None if success else mqtt.error_string(result.rc)
|
||||||
|
finalize_crash_recovery_command(command_id, published=success, error=error)
|
||||||
|
if success:
|
||||||
|
logging.info(
|
||||||
|
"event=crash_recovery_command_issued client_uuid=%s reason=%s command_id=%s",
|
||||||
|
cuuid, reason, command_id,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logging.error(
|
||||||
|
"event=crash_recovery_publish_failed client_uuid=%s reason=%s command_id=%s error=%s",
|
||||||
|
cuuid, reason, command_id, error,
|
||||||
|
)
|
||||||
|
except Exception as cmd_exc:
|
||||||
|
logging.error(
|
||||||
|
"event=crash_recovery_command_error client_uuid=%s reason=%s error=%s",
|
||||||
|
cuuid, reason, cmd_exc,
|
||||||
|
)
|
||||||
|
except Exception as scan_exc:
|
||||||
|
logging.error("event=crash_recovery_scan_error error=%s", scan_exc)
|
||||||
|
|
||||||
|
try:
|
||||||
|
expired_count = sweep_expired_commands()
|
||||||
|
if expired_count:
|
||||||
|
logging.info("event=command_expiry_sweep expired=%s", expired_count)
|
||||||
|
except Exception as sweep_exc:
|
||||||
|
logging.error("event=command_expiry_sweep_error error=%s", sweep_exc)
|
||||||
|
|
||||||
time.sleep(POLL_INTERVAL)
|
time.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,63 @@
|
|||||||
|
"""add client commands table
|
||||||
|
|
||||||
|
Revision ID: aa12bb34cc56
|
||||||
|
Revises: f3c4d5e6a7b8
|
||||||
|
Create Date: 2026-04-03 12:40:00.000000
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = 'aa12bb34cc56'
|
||||||
|
down_revision: Union[str, None] = 'f3c4d5e6a7b8'
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the client_commands table plus its lookup indexes."""
    op.create_table(
        'client_commands',
        sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
        # External identifier used in MQTT payloads (UUID4 string).
        sa.Column('command_id', sa.String(length=36), nullable=False),
        sa.Column('client_uuid', sa.String(length=36), nullable=False),
        sa.Column('action', sa.String(length=32), nullable=False),
        sa.Column('status', sa.String(length=40), nullable=False),
        sa.Column('reason', sa.Text(), nullable=True),
        sa.Column('requested_by', sa.Integer(), nullable=True),
        # Lifecycle timestamps (all timezone-aware).
        sa.Column('issued_at', sa.TIMESTAMP(timezone=True), nullable=False),
        sa.Column('expires_at', sa.TIMESTAMP(timezone=True), nullable=False),
        sa.Column('published_at', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.Column('acked_at', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.Column('execution_started_at', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.Column('completed_at', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.Column('failed_at', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.Column('error_code', sa.String(length=64), nullable=True),
        sa.Column('error_message', sa.Text(), nullable=True),
        sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.current_timestamp(), nullable=False),
        sa.Column('updated_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.current_timestamp(), nullable=False),
        # Commands disappear with their client; deleting a user keeps the audit row.
        sa.ForeignKeyConstraint(['client_uuid'], ['clients.uuid'], ondelete='CASCADE'),
        sa.ForeignKeyConstraint(['requested_by'], ['users.id'], ondelete='SET NULL'),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('command_id')
    )

    op.create_index(op.f('ix_client_commands_action'), 'client_commands', ['action'], unique=False)
    op.create_index(op.f('ix_client_commands_client_uuid'), 'client_commands', ['client_uuid'], unique=False)
    op.create_index(op.f('ix_client_commands_command_id'), 'client_commands', ['command_id'], unique=False)
    op.create_index(op.f('ix_client_commands_requested_by'), 'client_commands', ['requested_by'], unique=False)
    op.create_index(op.f('ix_client_commands_status'), 'client_commands', ['status'], unique=False)
    # Composite index backing the lockout/history query (client + status + recency).
    op.create_index('ix_client_commands_client_status_created', 'client_commands', ['client_uuid', 'status', 'created_at'], unique=False)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the client_commands table together with all of its indexes."""
    # Composite index first, then the single-column indexes, then the table.
    op.drop_index('ix_client_commands_client_status_created', table_name='client_commands')
    for column in ('status', 'requested_by', 'command_id', 'client_uuid', 'action'):
        op.drop_index(op.f(f'ix_client_commands_{column}'), table_name='client_commands')
    op.drop_table('client_commands')
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
"""add service_failed and mqtt broker connection fields to clients
|
||||||
|
|
||||||
|
Revision ID: b1c2d3e4f5a6
|
||||||
|
Revises: aa12bb34cc56
|
||||||
|
Create Date: 2026-04-05 10:00:00.000000
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = 'b1c2d3e4f5a6'
|
||||||
|
down_revision: Union[str, None] = 'aa12bb34cc56'
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add service-failed and MQTT broker-health columns to clients.

    Idempotent: each column is only added when it does not exist yet, so the
    migration can be re-run against partially migrated databases.
    """
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    present = {column['name'] for column in inspector.get_columns('clients')}

    new_columns = (
        # Systemd service-failed tracking
        ('service_failed_at', sa.TIMESTAMP(timezone=True)),
        ('service_failed_unit', sa.String(128)),
        # MQTT broker connection health
        ('mqtt_reconnect_count', sa.Integer()),
        ('mqtt_last_disconnect_at', sa.TIMESTAMP(timezone=True)),
    )
    for name, column_type in new_columns:
        if name not in present:
            op.add_column('clients', sa.Column(name, column_type, nullable=True))
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Remove the columns added by this revision (reverse of upgrade order)."""
    for name in ('mqtt_last_disconnect_at', 'mqtt_reconnect_count',
                 'service_failed_unit', 'service_failed_at'):
        op.drop_column('clients', name)
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
from sqlalchemy import create_engine, text
|
from sqlalchemy import create_engine, text
|
||||||
import os
|
import os
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import bcrypt
|
import bcrypt
|
||||||
|
|||||||
@@ -421,6 +421,8 @@ def get_monitoring_overview():
|
|||||||
},
|
},
|
||||||
"latest_log": _serialize_log_entry(latest_log),
|
"latest_log": _serialize_log_entry(latest_log),
|
||||||
"latest_error": _serialize_log_entry(latest_error),
|
"latest_error": _serialize_log_entry(latest_error),
|
||||||
|
"mqtt_reconnect_count": client.mqtt_reconnect_count,
|
||||||
|
"mqtt_last_disconnect_at": client.mqtt_last_disconnect_at.isoformat() if client.mqtt_last_disconnect_at else None,
|
||||||
})
|
})
|
||||||
|
|
||||||
summary_counts["total_clients"] += 1
|
summary_counts["total_clients"] += 1
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
from server.database import Session
|
from server.database import Session
|
||||||
from models.models import Client, ClientGroup
|
from models.models import Client, ClientGroup, ClientCommand, ProcessStatus
|
||||||
from flask import Blueprint, request, jsonify
|
from flask import Blueprint, request, jsonify, session as flask_session
|
||||||
from server.permissions import admin_or_higher
|
from server.permissions import admin_or_higher
|
||||||
|
from server.routes.groups import get_grace_period, is_client_alive
|
||||||
from server.mqtt_helper import publish_client_group, delete_client_group_message, publish_multiple_client_groups
|
from server.mqtt_helper import publish_client_group, delete_client_group_message, publish_multiple_client_groups
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
@@ -9,13 +10,196 @@ import glob
|
|||||||
import base64
|
import base64
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
from datetime import datetime, timezone
|
import uuid as uuid_lib
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
sys.path.append('/workspace')
|
sys.path.append('/workspace')
|
||||||
|
|
||||||
clients_bp = Blueprint("clients", __name__, url_prefix="/api/clients")
|
clients_bp = Blueprint("clients", __name__, url_prefix="/api/clients")
|
||||||
|
|
||||||
VALID_SCREENSHOT_TYPES = {"periodic", "event_start", "event_stop"}
|
VALID_SCREENSHOT_TYPES = {"periodic", "event_start", "event_stop"}
|
||||||
|
|
||||||
|
# Contract version stamped into every command payload.
COMMAND_SCHEMA_VERSION = "1.0"
# Primary topic plus two transitional topics for older client builds.
COMMAND_TOPIC_TEMPLATE = "infoscreen/{uuid}/commands"
COMMAND_TOPIC_COMPAT_TEMPLATE = "infoscreen/{uuid}/command"
LEGACY_RESTART_TOPIC_TEMPLATE = "clients/{uuid}/restart"
# Lifetime of an issued command before the scheduler's expiry sweep fails it.
COMMAND_EXPIRY_SECONDS = 240
# Safety lockout: at most REBOOT_LOCKOUT_THRESHOLD reboot/restart commands
# per client within REBOOT_LOCKOUT_WINDOW_MINUTES.
REBOOT_LOCKOUT_WINDOW_MINUTES = 15
REBOOT_LOCKOUT_THRESHOLD = 3
# Maps public API action names to internal command actions.
API_ACTION_TO_COMMAND_ACTION = {
    "restart": "reboot_host",
    "shutdown": "shutdown_host",
    "restart_app": "restart_app",
}
ALLOWED_COMMAND_ACTIONS = set(API_ACTION_TO_COMMAND_ACTION.keys())
|
||||||
|
|
||||||
|
|
||||||
|
def _iso_utc_z(ts: datetime) -> str:
|
||||||
|
return ts.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
|
||||||
|
|
||||||
|
|
||||||
|
def _command_to_dict(command: ClientCommand) -> dict:
    """Serialize a ClientCommand row into a camelCase, JSON-ready dict."""
    def fmt(ts):
        # Timestamps go out as ISO-8601 strings; missing timestamps as None.
        return ts.isoformat() if ts else None

    return {
        "commandId": command.command_id,
        "clientUuid": command.client_uuid,
        "action": command.action,
        "status": command.status,
        "reason": command.reason,
        "requestedBy": command.requested_by,
        "issuedAt": fmt(command.issued_at),
        "expiresAt": fmt(command.expires_at),
        "publishedAt": fmt(command.published_at),
        "ackedAt": fmt(command.acked_at),
        "executionStartedAt": fmt(command.execution_started_at),
        "completedAt": fmt(command.completed_at),
        "failedAt": fmt(command.failed_at),
        "errorCode": command.error_code,
        "errorMessage": command.error_message,
        "createdAt": fmt(command.created_at),
        "updatedAt": fmt(command.updated_at),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _publish_client_command(client_uuid: str, action: str, payload: dict) -> None:
    """Publish a command payload for *client_uuid* over MQTT.

    Publishes to the primary ``infoscreen/<uuid>/commands`` topic, to the
    transitional singular ``.../command`` topic, and — for the ``restart``
    API action — additionally to the legacy ``clients/<uuid>/restart`` topic.

    Raises on connection or publish failure; the caller records the failure
    on the command row.
    """
    import paho.mqtt.client as mqtt

    broker_host = os.getenv("MQTT_BROKER_HOST", "mqtt")
    broker_port = int(os.getenv("MQTT_BROKER_PORT", 1883))
    username = os.getenv("MQTT_USER")
    password = os.getenv("MQTT_PASSWORD")

    mqtt_client = mqtt.Client()
    if username and password:
        mqtt_client.username_pw_set(username, password)

    mqtt_client.connect(broker_host, broker_port)
    try:
        # Primary topic for contract-based command handling.
        command_topic = COMMAND_TOPIC_TEMPLATE.format(uuid=client_uuid)
        result = mqtt_client.publish(command_topic, json.dumps(payload), qos=1, retain=False)
        result.wait_for_publish(timeout=5.0)

        # Transitional compatibility for clients that still consume singular topic naming.
        compat_topic = COMMAND_TOPIC_COMPAT_TEMPLATE.format(uuid=client_uuid)
        compat_result = mqtt_client.publish(compat_topic, json.dumps(payload), qos=1, retain=False)
        compat_result.wait_for_publish(timeout=5.0)

        # Transitional compatibility for existing restart-only clients.
        if action == "restart":
            legacy_topic = LEGACY_RESTART_TOPIC_TEMPLATE.format(uuid=client_uuid)
            legacy_payload = {"action": "restart"}
            legacy_result = mqtt_client.publish(legacy_topic, json.dumps(legacy_payload), qos=1, retain=False)
            legacy_result.wait_for_publish(timeout=5.0)
    finally:
        # Always release the broker connection — the original leaked the
        # socket when a publish raised or timed out before disconnect().
        mqtt_client.disconnect()
|
||||||
|
|
||||||
|
|
||||||
|
def _issue_client_command(client_uuid: str, action: str):
    """Create, persist and publish a command for *client_uuid*.

    Lifecycle: validate action -> lockout check -> persist as ``queued`` ->
    flip to ``publish_in_progress`` -> MQTT publish -> ``published`` (or
    ``failed``).  Returns a Flask ``(response, status)`` tuple:
    202 on publish, 400 unknown action, 404 unknown client,
    429 safety lockout, 500 publish failure.
    """
    if action not in ALLOWED_COMMAND_ACTIONS:
        return jsonify({"error": f"Unsupported action '{action}'"}), 400

    # Translate the public API action name into the internal command action.
    command_action = API_ACTION_TO_COMMAND_ACTION[action]

    data = request.get_json(silent=True) or {}
    reason = str(data.get("reason", "")).strip() or None
    requested_by = flask_session.get("user_id")

    now_utc = datetime.now(timezone.utc)
    expires_at = now_utc + timedelta(seconds=COMMAND_EXPIRY_SECONDS)
    command_id = str(uuid_lib.uuid4())

    db = Session()
    try:
        client = db.query(Client).filter_by(uuid=client_uuid).first()
        if not client:
            return jsonify({"error": "Client nicht gefunden"}), 404

        # Safety lockout: avoid rapid repeated reboot loops per client.
        if command_action in ("reboot_host", "restart_app"):
            window_start = now_utc - timedelta(minutes=REBOOT_LOCKOUT_WINDOW_MINUTES)
            recent_reboots = (
                db.query(ClientCommand)
                .filter(ClientCommand.client_uuid == client_uuid)
                .filter(ClientCommand.action.in_(["reboot_host", "restart_app"]))
                .filter(ClientCommand.created_at >= window_start)
                .count()
            )
            if recent_reboots >= REBOOT_LOCKOUT_THRESHOLD:
                # Persist a terminal "blocked_safety" row for auditability.
                blocked = ClientCommand(
                    command_id=command_id,
                    client_uuid=client_uuid,
                    action=command_action,
                    status="blocked_safety",
                    reason=reason,
                    requested_by=requested_by,
                    issued_at=now_utc,
                    expires_at=expires_at,
                    failed_at=now_utc,
                    error_code="lockout_threshold",
                    error_message="Reboot lockout active for this client",
                )
                db.add(blocked)
                db.commit()
                return jsonify({
                    "success": False,
                    "message": "Neustart voruebergehend blockiert (Sicherheits-Lockout)",
                    "command": _command_to_dict(blocked),
                }), 429

        # Persist the command as "queued" first so even a crash before publish
        # leaves an auditable row behind.
        command = ClientCommand(
            command_id=command_id,
            client_uuid=client_uuid,
            action=command_action,
            status="queued",
            reason=reason,
            requested_by=requested_by,
            issued_at=now_utc,
            expires_at=expires_at,
        )
        db.add(command)
        db.commit()

        # Second commit: mark that the MQTT publish attempt is starting.
        command.status = "publish_in_progress"
        db.commit()

        payload = {
            "schema_version": COMMAND_SCHEMA_VERSION,
            "command_id": command.command_id,
            "client_uuid": command.client_uuid,
            "action": command.action,
            "issued_at": _iso_utc_z(command.issued_at),
            "expires_at": _iso_utc_z(command.expires_at),
            "requested_by": command.requested_by,
            "reason": command.reason,
        }

        try:
            _publish_client_command(client_uuid=client_uuid, action=action, payload=payload)
            # ACK can arrive very quickly (including terminal failure) while publish is in-flight.
            # Refresh to avoid regressing a newer listener-updated state back to "published".
            db.refresh(command)
            command.published_at = command.published_at or datetime.now(timezone.utc)
            if command.status in {"queued", "publish_in_progress"}:
                command.status = "published"
            db.commit()
            return jsonify({
                "success": True,
                "message": f"Command published for client {client_uuid}",
                "command": _command_to_dict(command),
            }), 202
        except Exception as publish_error:
            # Terminal failure: record the publish error on the command row.
            command.status = "failed"
            command.failed_at = datetime.now(timezone.utc)
            command.error_code = "mqtt_publish_failed"
            command.error_message = str(publish_error)
            db.commit()
            return jsonify({
                "success": False,
                "error": f"Failed to publish command: {publish_error}",
                "command": _command_to_dict(command),
            }), 500
    finally:
        db.close()
|
||||||
|
|
||||||
|
|
||||||
def _normalize_screenshot_type(raw_type):
|
def _normalize_screenshot_type(raw_type):
|
||||||
if raw_type is None:
|
if raw_type is None:
|
||||||
@@ -280,45 +464,148 @@ def get_clients_with_alive_status():
|
|||||||
"ip": c.ip,
|
"ip": c.ip,
|
||||||
"last_alive": c.last_alive.isoformat() if c.last_alive else None,
|
"last_alive": c.last_alive.isoformat() if c.last_alive else None,
|
||||||
"is_active": c.is_active,
|
"is_active": c.is_active,
|
||||||
"is_alive": bool(c.last_alive and c.is_active),
|
"is_alive": is_client_alive(c.last_alive, c.is_active),
|
||||||
})
|
})
|
||||||
session.close()
|
session.close()
|
||||||
return jsonify(result)
|
return jsonify(result)
|
||||||
|
|
||||||
|
|
||||||
|
@clients_bp.route("/crashed", methods=["GET"])
@admin_or_higher
def get_crashed_clients():
    """Return active clients that are crashed or heartbeat-stale.

    A client counts as crashed when ``process_status`` is ``crashed``; it
    counts as heartbeat-stale when :func:`is_client_alive` reports it dead
    under the configured grace period.  (The original computed an unused
    ``stale_cutoff`` and re-imported datetime names already available at
    module level — both removed.)
    """
    session = Session()
    try:
        grace = get_grace_period()  # seconds; echoed back to the caller for context
        clients = (
            session.query(Client)
            .filter(Client.is_active == True)  # noqa: E712 — SQLAlchemy expression
            .all()
        )
        result = []
        for c in clients:
            alive = is_client_alive(c.last_alive, c.is_active)
            crashed = c.process_status == ProcessStatus.crashed
            if not alive or crashed:
                result.append({
                    "uuid": c.uuid,
                    "description": c.description,
                    "hostname": c.hostname,
                    "ip": c.ip,
                    "group_id": c.group_id,
                    "is_alive": alive,
                    "process_status": c.process_status.value if c.process_status else None,
                    "screen_health_status": c.screen_health_status.value if c.screen_health_status else None,
                    "last_alive": c.last_alive.isoformat() if c.last_alive else None,
                    "crash_reason": "process_crashed" if crashed else "heartbeat_stale",
                })
        return jsonify({
            "crashed_count": len(result),
            "grace_period_seconds": grace,
            "clients": result,
        })
    finally:
        session.close()
|
||||||
|
|
||||||
|
|
||||||
|
@clients_bp.route("/service_failed", methods=["GET"])
@admin_or_higher
def get_service_failed_clients():
    """List clients whose systemd unit entered the failed state.

    A client appears here while ``service_failed_at`` is set; entries are
    sorted newest failure first.
    """
    session = Session()
    try:
        failed_clients = (
            session.query(Client)
            .filter(Client.service_failed_at.isnot(None))
            .order_by(Client.service_failed_at.desc())
            .all()
        )
        payload = []
        for entry in failed_clients:
            payload.append({
                "uuid": entry.uuid,
                "description": entry.description,
                "hostname": entry.hostname,
                "ip": entry.ip,
                "group_id": entry.group_id,
                "service_failed_at": entry.service_failed_at.isoformat() if entry.service_failed_at else None,
                "service_failed_unit": entry.service_failed_unit,
                "is_alive": is_client_alive(entry.last_alive, entry.is_active),
                "last_alive": entry.last_alive.isoformat() if entry.last_alive else None,
            })
        return jsonify({"service_failed_count": len(payload), "clients": payload})
    finally:
        session.close()
|
||||||
|
|
||||||
|
|
||||||
|
@clients_bp.route("/<client_uuid>/clear_service_failed", methods=["POST"])
@admin_or_higher
def clear_service_failed(client_uuid):
    """Clear a client's service_failed flag and delete the retained MQTT message.

    The DB flag is cleared first; removing the retained broker message is
    best-effort afterwards, so a broker failure does not undo the DB change.

    Returns:
        JSON success object, or 404 when the client UUID is unknown.
    """
    import paho.mqtt.client as mqtt_lib

    session = Session()
    try:
        c = session.query(Client).filter_by(uuid=client_uuid).first()
        if not c:
            return jsonify({"error": "Client nicht gefunden"}), 404
        if c.service_failed_at is None:
            # Idempotent: nothing to clear is still a success for the caller.
            return jsonify({"success": True, "message": "Kein service_failed Flag gesetzt."}), 200

        c.service_failed_at = None
        c.service_failed_unit = None
        session.commit()
    finally:
        session.close()

    # Clear the retained MQTT message (publish empty payload, retained=True)
    try:
        broker_host = os.getenv("MQTT_BROKER_HOST", "mqtt")
        broker_port = int(os.getenv("MQTT_BROKER_PORT", 1883))
        username = os.getenv("MQTT_USER")
        password = os.getenv("MQTT_PASSWORD")
        mc = mqtt_lib.Client()
        if username and password:
            mc.username_pw_set(username, password)
        mc.connect(broker_host, broker_port)
        # A network loop must be running for the QoS-1 handshake to complete;
        # without it the publish may be dropped on disconnect.
        mc.loop_start()
        topic = f"infoscreen/{client_uuid}/service_failed"
        info = mc.publish(topic, payload=None, qos=1, retain=True)
        info.wait_for_publish(timeout=5)
        mc.loop_stop()
        mc.disconnect()
    except Exception as e:
        # Log but don't fail — DB is already cleared
        import logging
        logging.warning(f"Could not clear retained service_failed MQTT message for {client_uuid}: {e}")

    return jsonify({"success": True, "message": "service_failed Flag gelöscht."})
|
||||||
|
|
||||||
|
|
||||||
@clients_bp.route("/<uuid>/restart", methods=["POST"])
@admin_or_higher
def restart_client(uuid):
    """Queue a restart command for one client.

    Thin wrapper around the shared command lifecycle; all validation,
    persistence and MQTT publishing happen in ``_issue_client_command``.
    """
    return _issue_client_command(client_uuid=uuid, action="restart")
|
||||||
Route to restart a specific client by UUID.
|
|
||||||
Sends an MQTT message to the broker to trigger the restart.
|
|
||||||
"""
|
|
||||||
import paho.mqtt.client as mqtt
|
|
||||||
import json
|
|
||||||
|
|
||||||
# MQTT broker configuration
|
|
||||||
MQTT_BROKER = "mqtt"
|
|
||||||
MQTT_PORT = 1883
|
|
||||||
MQTT_TOPIC = f"clients/{uuid}/restart"
|
|
||||||
|
|
||||||
# Connect to the database to check if the client exists
|
@clients_bp.route("/<uuid>/shutdown", methods=["POST"])
@admin_or_higher
def shutdown_client(uuid):
    """Queue a shutdown command for one client.

    Thin wrapper around the shared command lifecycle; all validation,
    persistence and MQTT publishing happen in ``_issue_client_command``.
    """
    return _issue_client_command(client_uuid=uuid, action="shutdown")
|
||||||
session.close()
|
|
||||||
return jsonify({"error": "Client nicht gefunden"}), 404
|
|
||||||
session.close()
|
|
||||||
|
|
||||||
# Send MQTT message
|
|
||||||
|
@clients_bp.route("/commands/<command_id>", methods=["GET"])
@admin_or_higher
def get_client_command_status(command_id):
    """Return the persisted lifecycle state of a single client command.

    Looks the command up by its ``command_id`` and serializes it via
    ``_command_to_dict``; unknown IDs yield a 404.
    """
    db = Session()
    try:
        command = db.query(ClientCommand).filter_by(command_id=command_id).first()
        if command is None:
            return jsonify({"error": "Command nicht gefunden"}), 404
        return jsonify(_command_to_dict(command)), 200
    finally:
        db.close()
|
||||||
except Exception as e:
|
|
||||||
return jsonify({"error": f"Failed to send MQTT message: {str(e)}"}), 500
|
|
||||||
|
|
||||||
|
|
||||||
@clients_bp.route("/<uuid>/screenshot", methods=["POST"])
|
@clients_bp.route("/<uuid>/screenshot", methods=["POST"])
|
||||||
|
|||||||
Reference in New Issue
Block a user