feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep

- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with same lifecycle + lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, acknowledge/"quittieren" action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
2026-04-05 10:17:56 +00:00
parent 4d652f0554
commit 03e3c11e90
35 changed files with 2511 additions and 80 deletions

View File

@@ -8,12 +8,28 @@ from .db_utils import (
compute_group_power_intent_basis,
build_group_power_intent_body,
compute_group_power_intent_fingerprint,
get_crash_recovery_candidates,
issue_crash_recovery_command,
finalize_crash_recovery_command,
sweep_expired_commands,
)
import paho.mqtt.client as mqtt
import json
import datetime
import time
import uuid
import ssl
# --- MQTT connection settings, resolved once from the environment at import ---

# Broker endpoint. The primary variable wins whenever it is present (even when
# empty); otherwise the alternate variable is consulted, then the literal default.
MQTT_BROKER_HOST = os.environ.get(
    "MQTT_BROKER_HOST",
    os.environ.get("MQTT_BROKER_URL", "mqtt"),
)
MQTT_BROKER_PORT = int(
    os.environ.get(
        "MQTT_BROKER_PORT",
        os.environ.get("MQTT_PORT", "1883"),
    )
)

# Credentials. An unset or empty MQTT_USER falls through to MQTT_USERNAME;
# both values stay None when nothing is configured.
MQTT_USERNAME = os.environ.get("MQTT_USER") or os.environ.get("MQTT_USERNAME")
MQTT_PASSWORD = os.environ.get("MQTT_PASSWORD")

# TLS switches. A flag is on when its trimmed, lower-cased value is one of
# "1", "true", "yes" or "on"; anything else (including unset) turns it off.
MQTT_TLS_ENABLED = (
    os.environ.get("MQTT_TLS_ENABLED", "false").strip().lower()
    in ("1", "true", "yes", "on")
)
# Optional certificate material paths; None when the variable is not set.
MQTT_TLS_CA_CERT = os.environ.get("MQTT_TLS_CA_CERT")
MQTT_TLS_CERTFILE = os.environ.get("MQTT_TLS_CERTFILE")
MQTT_TLS_KEYFILE = os.environ.get("MQTT_TLS_KEYFILE")
MQTT_TLS_INSECURE = (
    os.environ.get("MQTT_TLS_INSECURE", "false").strip().lower()
    in ("1", "true", "yes", "on")
)
def _to_utc_z(dt: datetime.datetime) -> str:
@@ -224,6 +240,19 @@ def main():
client = mqtt.Client(callback_api_version=mqtt.CallbackAPIVersion.VERSION2)
client.reconnect_delay_set(min_delay=1, max_delay=30)
if MQTT_USERNAME and MQTT_PASSWORD:
client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD)
if MQTT_TLS_ENABLED:
client.tls_set(
ca_certs=MQTT_TLS_CA_CERT,
certfile=MQTT_TLS_CERTFILE,
keyfile=MQTT_TLS_KEYFILE,
cert_reqs=ssl.CERT_REQUIRED,
)
if MQTT_TLS_INSECURE:
client.tls_insecure_set(True)
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL_SECONDS", "30"))
# 0 = off; e.g. 600 for every 10 minutes
# initial value from DB or fallback to env
@@ -238,16 +267,21 @@ def main():
POWER_INTENT_HEARTBEAT_ENABLED = _env_bool("POWER_INTENT_HEARTBEAT_ENABLED", True)
POWER_INTENT_EXPIRY_MULTIPLIER = int(os.getenv("POWER_INTENT_EXPIRY_MULTIPLIER", "3"))
POWER_INTENT_MIN_EXPIRY_SECONDS = int(os.getenv("POWER_INTENT_MIN_EXPIRY_SECONDS", "90"))
CRASH_RECOVERY_ENABLED = _env_bool("CRASH_RECOVERY_ENABLED", False)
CRASH_RECOVERY_GRACE_SECONDS = int(os.getenv("CRASH_RECOVERY_GRACE_SECONDS", "180"))
logging.info(
"Scheduler config: poll_interval=%ss refresh_seconds=%s power_intent_enabled=%s "
"power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss",
"power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss "
"crash_recovery_enabled=%s crash_recovery_grace=%ss",
POLL_INTERVAL,
REFRESH_SECONDS,
POWER_INTENT_PUBLISH_ENABLED,
POWER_INTENT_HEARTBEAT_ENABLED,
POWER_INTENT_EXPIRY_MULTIPLIER,
POWER_INTENT_MIN_EXPIRY_SECONDS,
CRASH_RECOVERY_ENABLED,
CRASH_RECOVERY_GRACE_SECONDS,
)
# Configurable time window in days (default: 7)
WINDOW_DAYS = int(os.getenv("EVENTS_WINDOW_DAYS", "7"))
@@ -275,8 +309,15 @@ def main():
client.on_connect = on_connect
client.connect("mqtt", 1883)
client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT)
client.loop_start()
logging.info(
"MQTT connection configured host=%s port=%s tls=%s auth=%s",
MQTT_BROKER_HOST,
MQTT_BROKER_PORT,
MQTT_TLS_ENABLED,
bool(MQTT_USERNAME and MQTT_PASSWORD),
)
while True:
now = datetime.datetime.now(datetime.timezone.utc)
@@ -390,6 +431,51 @@ def main():
power_intent_metrics["retained_republish_total"],
)
if CRASH_RECOVERY_ENABLED:
try:
candidates = get_crash_recovery_candidates(CRASH_RECOVERY_GRACE_SECONDS)
if candidates:
logging.info("event=crash_recovery_scan candidates=%s", len(candidates))
for candidate in candidates:
cuuid = candidate["uuid"]
reason = candidate["reason"]
try:
command_id, payload, topic, compat_topic = issue_crash_recovery_command(
client_uuid=cuuid,
reason=reason,
)
result = client.publish(topic, json.dumps(payload), qos=1, retain=False)
result.wait_for_publish(timeout=5.0)
compat_result = client.publish(compat_topic, json.dumps(payload), qos=1, retain=False)
compat_result.wait_for_publish(timeout=5.0)
success = result.rc == mqtt.MQTT_ERR_SUCCESS
error = None if success else mqtt.error_string(result.rc)
finalize_crash_recovery_command(command_id, published=success, error=error)
if success:
logging.info(
"event=crash_recovery_command_issued client_uuid=%s reason=%s command_id=%s",
cuuid, reason, command_id,
)
else:
logging.error(
"event=crash_recovery_publish_failed client_uuid=%s reason=%s command_id=%s error=%s",
cuuid, reason, command_id, error,
)
except Exception as cmd_exc:
logging.error(
"event=crash_recovery_command_error client_uuid=%s reason=%s error=%s",
cuuid, reason, cmd_exc,
)
except Exception as scan_exc:
logging.error("event=crash_recovery_scan_error error=%s", scan_exc)
try:
expired_count = sweep_expired_commands()
if expired_count:
logging.info("event=command_expiry_sweep expired=%s", expired_count)
except Exception as sweep_exc:
logging.error("event=command_expiry_sweep_error error=%s", sweep_exc)
time.sleep(POLL_INTERVAL)