feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep

- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with same lifecycle + lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, "quittieren" (acknowledge) action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
2026-04-05 10:17:56 +00:00
parent 4d652f0554
commit 03e3c11e90
35 changed files with 2511 additions and 80 deletions
--- a/scheduler/scheduler.py
+++ b/scheduler/scheduler.py
@@ -8,12 +8,28 @@ from .db_utils import (
    compute_group_power_intent_basis,
    build_group_power_intent_body,
    compute_group_power_intent_fingerprint,
+    get_crash_recovery_candidates,
+    issue_crash_recovery_command,
+    finalize_crash_recovery_command,
+    sweep_expired_commands,
 )
 import paho.mqtt.client as mqtt
 import json
 import datetime
 import time
 import uuid
+import ssl
+
+
+MQTT_BROKER_HOST = os.getenv("MQTT_BROKER_HOST", os.getenv("MQTT_BROKER_URL", "mqtt"))
+MQTT_BROKER_PORT = int(os.getenv("MQTT_BROKER_PORT", os.getenv("MQTT_PORT", "1883")))
+MQTT_USERNAME = os.getenv("MQTT_USER") or os.getenv("MQTT_USERNAME")
+MQTT_PASSWORD = os.getenv("MQTT_PASSWORD")
+MQTT_TLS_ENABLED = os.getenv("MQTT_TLS_ENABLED", "false").strip().lower() in ("1", "true", "yes", "on")
+MQTT_TLS_CA_CERT = os.getenv("MQTT_TLS_CA_CERT")
+MQTT_TLS_CERTFILE = os.getenv("MQTT_TLS_CERTFILE")
+MQTT_TLS_KEYFILE = os.getenv("MQTT_TLS_KEYFILE")
+MQTT_TLS_INSECURE = os.getenv("MQTT_TLS_INSECURE", "false").strip().lower() in ("1", "true", "yes", "on")


 def _to_utc_z(dt: datetime.datetime) -> str:
@@ -224,6 +240,19 @@ def main():
    client = mqtt.Client(callback_api_version=mqtt.CallbackAPIVersion.VERSION2)
    client.reconnect_delay_set(min_delay=1, max_delay=30)

+    if MQTT_USERNAME and MQTT_PASSWORD:
+        client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD)
+
+    if MQTT_TLS_ENABLED:
+        client.tls_set(
+            ca_certs=MQTT_TLS_CA_CERT,
+            certfile=MQTT_TLS_CERTFILE,
+            keyfile=MQTT_TLS_KEYFILE,
+            cert_reqs=ssl.CERT_REQUIRED,
+        )
+        if MQTT_TLS_INSECURE:
+            client.tls_insecure_set(True)
+
    POLL_INTERVAL = int(os.getenv("POLL_INTERVAL_SECONDS", "30"))
    # 0 = aus; z.B. 600 für alle 10 Min
    # initial value from DB or fallback to env
@@ -238,16 +267,21 @@ def main():
    POWER_INTENT_HEARTBEAT_ENABLED = _env_bool("POWER_INTENT_HEARTBEAT_ENABLED", True)
    POWER_INTENT_EXPIRY_MULTIPLIER = int(os.getenv("POWER_INTENT_EXPIRY_MULTIPLIER", "3"))
    POWER_INTENT_MIN_EXPIRY_SECONDS = int(os.getenv("POWER_INTENT_MIN_EXPIRY_SECONDS", "90"))
+    CRASH_RECOVERY_ENABLED = _env_bool("CRASH_RECOVERY_ENABLED", False)
+    CRASH_RECOVERY_GRACE_SECONDS = int(os.getenv("CRASH_RECOVERY_GRACE_SECONDS", "180"))

    logging.info(
        "Scheduler config: poll_interval=%ss refresh_seconds=%s power_intent_enabled=%s "
-        "power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss",
+        "power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss "
+        "crash_recovery_enabled=%s crash_recovery_grace=%ss",
        POLL_INTERVAL,
        REFRESH_SECONDS,
        POWER_INTENT_PUBLISH_ENABLED,
        POWER_INTENT_HEARTBEAT_ENABLED,
        POWER_INTENT_EXPIRY_MULTIPLIER,
        POWER_INTENT_MIN_EXPIRY_SECONDS,
+        CRASH_RECOVERY_ENABLED,
+        CRASH_RECOVERY_GRACE_SECONDS,
    )
    # Konfigurierbares Zeitfenster in Tagen (Standard: 7)
    WINDOW_DAYS = int(os.getenv("EVENTS_WINDOW_DAYS", "7"))
@@ -275,8 +309,15 @@ def main():

    client.on_connect = on_connect

-    client.connect("mqtt", 1883)
+    client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT)
    client.loop_start()
+    logging.info(
+        "MQTT connection configured host=%s port=%s tls=%s auth=%s",
+        MQTT_BROKER_HOST,
+        MQTT_BROKER_PORT,
+        MQTT_TLS_ENABLED,
+        bool(MQTT_USERNAME and MQTT_PASSWORD),
+    )

    while True:
        now = datetime.datetime.now(datetime.timezone.utc)
@@ -390,6 +431,51 @@ def main():
                power_intent_metrics["retained_republish_total"],
            )

+        if CRASH_RECOVERY_ENABLED:
+            try:
+                candidates = get_crash_recovery_candidates(CRASH_RECOVERY_GRACE_SECONDS)
+                if candidates:
+                    logging.info("event=crash_recovery_scan candidates=%s", len(candidates))
+                for candidate in candidates:
+                    cuuid = candidate["uuid"]
+                    reason = candidate["reason"]
+                    try:
+                        command_id, payload, topic, compat_topic = issue_crash_recovery_command(
+                            client_uuid=cuuid,
+                            reason=reason,
+                        )
+                        result = client.publish(topic, json.dumps(payload), qos=1, retain=False)
+                        result.wait_for_publish(timeout=5.0)
+                        compat_result = client.publish(compat_topic, json.dumps(payload), qos=1, retain=False)
+                        compat_result.wait_for_publish(timeout=5.0)
+                        success = result.rc == mqtt.MQTT_ERR_SUCCESS
+                        error = None if success else mqtt.error_string(result.rc)
+                        finalize_crash_recovery_command(command_id, published=success, error=error)
+                        if success:
+                            logging.info(
+                                "event=crash_recovery_command_issued client_uuid=%s reason=%s command_id=%s",
+                                cuuid, reason, command_id,
+                            )
+                        else:
+                            logging.error(
+                                "event=crash_recovery_publish_failed client_uuid=%s reason=%s command_id=%s error=%s",
+                                cuuid, reason, command_id, error,
+                            )
+                    except Exception as cmd_exc:
+                        logging.error(
+                            "event=crash_recovery_command_error client_uuid=%s reason=%s error=%s",
+                            cuuid, reason, cmd_exc,
+                        )
+            except Exception as scan_exc:
+                logging.error("event=crash_recovery_scan_error error=%s", scan_exc)
+
+        try:
+            expired_count = sweep_expired_commands()
+            if expired_count:
+                logging.info("event=command_expiry_sweep expired=%s", expired_count)
+        except Exception as sweep_exc:
+            logging.error("event=command_expiry_sweep_error error=%s", sweep_exc)
+
        time.sleep(POLL_INTERVAL)