feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep

- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with same lifecycle + lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, "quittieren" (acknowledge) action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
2026-04-05 10:17:56 +00:00
parent 4d652f0554
commit 03e3c11e90
35 changed files with 2511 additions and 80 deletions
--- a/scheduler/db_utils.py
+++ b/scheduler/db_utils.py
@@ -1,13 +1,14 @@
 # scheduler/db_utils.py
 from dotenv import load_dotenv
 import os
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 import hashlib
 import json
 import logging
 from sqlalchemy.orm import sessionmaker, joinedload
 from sqlalchemy import create_engine, or_, and_, text
-from models.models import Event, EventMedia, EventException, SystemSetting
+import uuid as _uuid_mod
+from models.models import Event, EventMedia, EventException, SystemSetting, Client, ClientCommand, ProcessStatus
 from dateutil.rrule import rrulestr
 from urllib.request import Request, urlopen
 from datetime import timezone
@@ -454,3 +455,167 @@ def format_event_with_media(event):
        # Add other event types (message, etc.) here as needed...

    return event_dict
+
+
+# ---------------------------------------------------------------------------
+# Crash detection / auto-recovery helpers
+# ---------------------------------------------------------------------------
+
+# Value written to the "schema_version" field of each crash-recovery command payload.
+_CRASH_RECOVERY_SCHEMA_VERSION = "1.0"
+# Per-client MQTT topic templates; "{uuid}" is replaced with the client UUID and both
+# resulting topics are handed back by issue_crash_recovery_command().
+# NOTE(review): the singular ".../command" topic looks like a backward-compatibility
+# alias for older clients - confirm against the client implementation.
+_CRASH_COMMAND_TOPIC = "infoscreen/{uuid}/commands"
+_CRASH_COMMAND_COMPAT_TOPIC = "infoscreen/{uuid}/command"
+# Seconds added to the issue time to compute a recovery command's expires_at.
+# Read once at import time from CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS (default 240).
+_CRASH_RECOVERY_EXPIRY_SECONDS = int(os.getenv("CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS", "240"))
+# Look-back window in minutes: a client with a reboot_host/restart_app command created
+# inside this window is not offered as a recovery candidate again.
+# Read once at import time from CRASH_RECOVERY_LOCKOUT_MINUTES (default 15).
+_CRASH_RECOVERY_LOCKOUT_MINUTES = int(os.getenv("CRASH_RECOVERY_LOCKOUT_MINUTES", "15"))
+
+
+def get_crash_recovery_candidates(heartbeat_grace_seconds: int) -> list:
+    """
+    Return the active clients that currently qualify for a crash-recovery command.
+
+    A client qualifies when it is active and either reports
+    process_status=crashed or has a last_alive timestamp older than
+    ``heartbeat_grace_seconds``. Clients that already have a reboot_host or
+    restart_app command (in any status) created within the last
+    _CRASH_RECOVERY_LOCKOUT_MINUTES minutes are skipped.
+
+    Args:
+        heartbeat_grace_seconds: Maximum age of last_alive, in seconds, before
+            a client counts as heartbeat-stale.
+
+    Returns:
+        A list of dicts with the keys "uuid", "reason" ("process_crashed" or
+        "heartbeat_stale"), "process_status" (enum value or None) and
+        "last_alive". "process_crashed" wins when both conditions hold.
+    """
+    session = Session()
+    try:
+        now = datetime.now(timezone.utc)
+        stale_cutoff = now - timedelta(seconds=heartbeat_grace_seconds)
+        lockout_cutoff = now - timedelta(minutes=_CRASH_RECOVERY_LOCKOUT_MINUTES)
+
+        # NOTE(review): a NULL last_alive never satisfies "<" in SQL, so a client
+        # that has not sent any heartbeat yet is only picked up via
+        # process_status - confirm this is intended.
+        candidates = (
+            session.query(Client)
+            .filter(Client.is_active == True)
+            .filter(
+                or_(
+                    Client.process_status == ProcessStatus.crashed,
+                    Client.last_alive < stale_cutoff,
+                )
+            )
+            .all()
+        )
+        if not candidates:
+            return []
+
+        # Resolve the lockout for all candidates with a single query instead of
+        # issuing one ClientCommand lookup per candidate.
+        locked_out = {
+            row[0]
+            for row in (
+                session.query(ClientCommand.client_uuid)
+                .filter(ClientCommand.client_uuid.in_([c.uuid for c in candidates]))
+                .filter(ClientCommand.created_at >= lockout_cutoff)
+                .filter(ClientCommand.action.in_(["reboot_host", "restart_app"]))
+                .distinct()
+            )
+        }
+
+        result = []
+        for c in candidates:
+            if c.uuid in locked_out:
+                continue
+            crash_reason = (
+                "process_crashed"
+                if c.process_status == ProcessStatus.crashed
+                else "heartbeat_stale"
+            )
+            result.append({
+                "uuid": c.uuid,
+                "reason": crash_reason,
+                "process_status": c.process_status.value if c.process_status else None,
+                "last_alive": c.last_alive,
+            })
+        return result
+    finally:
+        session.close()
+
+
+def issue_crash_recovery_command(client_uuid: str, reason: str) -> tuple:
+    """
+    Persist a reboot_host ClientCommand for crash recovery and build its MQTT message.
+
+    The row is first committed with status "queued" and then committed again
+    with status "publish_in_progress"; the actual MQTT publish is left to the
+    caller.
+
+    Args:
+        client_uuid: UUID of the client the command is addressed to.
+        reason: Stored on the command row and copied into the payload.
+
+    Returns:
+        A 4-tuple ``(command_id, payload_dict, topic, compat_topic)``.
+
+    Raises:
+        Exception: Any error is re-raised after the session was rolled back.
+    """
+    session = Session()
+    try:
+        issued_at = datetime.now(timezone.utc)
+        expires_at = issued_at + timedelta(seconds=_CRASH_RECOVERY_EXPIRY_SECONDS)
+        command_id = str(_uuid_mod.uuid4())
+
+        record = ClientCommand(
+            command_id=command_id,
+            client_uuid=client_uuid,
+            action="reboot_host",
+            status="queued",
+            reason=reason,
+            requested_by=None,
+            issued_at=issued_at,
+            expires_at=expires_at,
+        )
+        session.add(record)
+        session.commit()
+        # Second commit moves the stored row on before the caller publishes.
+        record.status = "publish_in_progress"
+        session.commit()
+
+        def _as_zulu(moment):
+            # ISO-8601 with a trailing "Z" in place of the "+00:00" UTC offset.
+            return moment.isoformat().replace("+00:00", "Z")
+
+        payload = {
+            "schema_version": _CRASH_RECOVERY_SCHEMA_VERSION,
+            "command_id": command_id,
+            "client_uuid": client_uuid,
+            "action": "reboot_host",
+            "issued_at": _as_zulu(issued_at),
+            "expires_at": _as_zulu(expires_at),
+            "requested_by": None,
+            "reason": reason,
+        }
+        topics = (
+            _CRASH_COMMAND_TOPIC.format(uuid=client_uuid),
+            _CRASH_COMMAND_COMPAT_TOPIC.format(uuid=client_uuid),
+        )
+        return (command_id, payload) + topics
+    except Exception:
+        session.rollback()
+        raise
+    finally:
+        session.close()
+
+
+def finalize_crash_recovery_command(command_id: str, published: bool, error: str = None) -> None:
+    """
+    Record the outcome of the MQTT publish attempt on a stored command.
+
+    Does nothing when no command with ``command_id`` exists.
+
+    Args:
+        command_id: Identifier of the ClientCommand row to update.
+        published: True sets status "published" and published_at; False sets
+            status "failed", failed_at and error_code "mqtt_publish_failed".
+        error: Error text stored as error_message on failure; falls back to
+            "Unknown publish error" when empty or None.
+
+    Raises:
+        Exception: Any database error is re-raised after the session was
+            rolled back.
+    """
+    session = Session()
+    try:
+        cmd = session.query(ClientCommand).filter_by(command_id=command_id).first()
+        if not cmd:
+            return
+        now = datetime.now(timezone.utc)
+        if published:
+            cmd.status = "published"
+            cmd.published_at = now
+        else:
+            cmd.status = "failed"
+            cmd.failed_at = now
+            cmd.error_code = "mqtt_publish_failed"
+            cmd.error_message = error or "Unknown publish error"
+        session.commit()
+    except Exception:
+        # Explicit rollback-and-re-raise, matching issue_crash_recovery_command()
+        # and sweep_expired_commands().
+        session.rollback()
+        raise
+    finally:
+        session.close()
+
+
+# Statuses that sweep_expired_commands() leaves untouched; a command in any other
+# status is marked "expired" once its expires_at has passed.
+_TERMINAL_COMMAND_STATUSES = {"completed", "failed", "expired", "canceled", "blocked_safety"}
+
+
+def sweep_expired_commands() -> int:
+    """Expire every overdue command that has not reached a terminal status.
+
+    A command is overdue when its expires_at lies before the current UTC time
+    and its status is not in _TERMINAL_COMMAND_STATUSES. Each such command gets
+    status "expired", failed_at set to the sweep time, and an "expired" error
+    code and message. Nothing is committed when no command matches.
+
+    Returns:
+        The number of commands that were marked as expired.
+
+    Raises:
+        Exception: Any error is re-raised after the session was rolled back.
+    """
+    session = Session()
+    try:
+        sweep_time = datetime.now(timezone.utc)
+        overdue = (
+            session.query(ClientCommand)
+            .filter(ClientCommand.expires_at < sweep_time)
+            .filter(ClientCommand.status.notin_(_TERMINAL_COMMAND_STATUSES))
+            .all()
+        )
+        swept = len(overdue)
+        if swept:
+            for stale in overdue:
+                stale.status = "expired"
+                stale.failed_at = sweep_time
+                stale.error_code = "expired"
+                stale.error_message = "Command expired before terminal state was reached."
+            session.commit()
+        return swept
+    except Exception:
+        session.rollback()
+        raise
+    finally:
+        session.close()