feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep

- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with same lifecycle + lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, acknowledge ("quittieren") action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
2026-04-05 10:17:56 +00:00
parent 4d652f0554
commit 03e3c11e90
35 changed files with 2511 additions and 80 deletions

View File

@@ -4,11 +4,12 @@ import logging
import datetime
import base64
import re
import ssl
import requests
import paho.mqtt.client as mqtt
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from models.models import Client, ClientLog, LogLevel, ProcessStatus, ScreenHealthStatus
from models.models import Client, ClientLog, ClientCommand, LogLevel, ProcessStatus, ScreenHealthStatus
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s')
# Load .env only when not already configured by Docker (API_BASE_URL not set by compose means we're outside a container)
@@ -32,6 +33,16 @@ Session = sessionmaker(bind=engine)
# API configuration
API_BASE_URL = os.getenv("API_BASE_URL", "http://server:8000")
MQTT_BROKER_HOST = os.getenv("MQTT_BROKER_HOST", "mqtt")
MQTT_BROKER_PORT = int(os.getenv("MQTT_BROKER_PORT", os.getenv("MQTT_PORT", "1883")))
MQTT_USERNAME = os.getenv("MQTT_USER") or os.getenv("MQTT_USERNAME")
MQTT_PASSWORD = os.getenv("MQTT_PASSWORD")
MQTT_TLS_ENABLED = os.getenv("MQTT_TLS_ENABLED", "false").strip().lower() in ("1", "true", "yes", "on")
MQTT_TLS_CA_CERT = os.getenv("MQTT_TLS_CA_CERT")
MQTT_TLS_CERTFILE = os.getenv("MQTT_TLS_CERTFILE")
MQTT_TLS_KEYFILE = os.getenv("MQTT_TLS_KEYFILE")
MQTT_TLS_INSECURE = os.getenv("MQTT_TLS_INSECURE", "false").strip().lower() in ("1", "true", "yes", "on")
# Dashboard payload migration observability
DASHBOARD_METRICS_LOG_EVERY = int(os.getenv("DASHBOARD_METRICS_LOG_EVERY", "5"))
DASHBOARD_PARSE_METRICS = {
@@ -376,8 +387,11 @@ def on_connect(client, userdata, flags, reasonCode, properties):
client.subscribe("infoscreen/+/logs/warn")
client.subscribe("infoscreen/+/logs/info")
client.subscribe("infoscreen/+/health")
client.subscribe("infoscreen/+/commands/ack")
client.subscribe("infoscreen/+/command/ack")
client.subscribe("infoscreen/+/service_failed")
logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, and health")
logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, health, and service_failed")
except Exception as e:
logging.error(f"Subscribe failed on connect: {e}")
@@ -387,6 +401,72 @@ def on_message(client, userdata, msg):
logging.debug(f"Empfangene Nachricht auf Topic: {topic}")
try:
# Command acknowledgement handling
if topic.startswith("infoscreen/") and (topic.endswith("/commands/ack") or topic.endswith("/command/ack")):
uuid = topic.split("/")[1]
try:
payload = json.loads(msg.payload.decode())
except (json.JSONDecodeError, UnicodeDecodeError):
logging.error(f"Ungueltiges Command-ACK Payload von {uuid}")
return
command_id = payload.get("command_id")
ack_status = str(payload.get("status", "")).strip().lower()
error_code = payload.get("error_code")
error_message = payload.get("error_message")
if not command_id:
logging.warning(f"Command-ACK ohne command_id von {uuid}")
return
status_map = {
"accepted": "ack_received",
"execution_started": "execution_started",
"completed": "completed",
"failed": "failed",
}
mapped_status = status_map.get(ack_status)
if not mapped_status:
logging.warning(f"Unbekannter Command-ACK Status '{ack_status}' von {uuid}")
return
db_session = Session()
try:
command_obj = db_session.query(ClientCommand).filter_by(command_id=command_id).first()
if not command_obj:
logging.warning(f"Command-ACK fuer unbekanntes command_id={command_id} von {uuid}")
return
# Ignore stale or duplicate ACKs: once a command is in a terminal state, a later ACK must not change its status.
terminal_states = {"completed", "failed", "expired", "canceled", "blocked_safety"}
if command_obj.status in terminal_states:
logging.info(
f"Command-ACK ignoriert (bereits terminal): command_id={command_id}, status={command_obj.status}"
)
return
now_utc = datetime.datetime.now(datetime.UTC)
command_obj.status = mapped_status
if mapped_status == "ack_received":
command_obj.acked_at = now_utc
elif mapped_status == "execution_started":
command_obj.execution_started_at = now_utc
elif mapped_status == "completed":
command_obj.completed_at = now_utc
elif mapped_status == "failed":
command_obj.failed_at = now_utc
command_obj.error_code = str(error_code) if error_code is not None else command_obj.error_code
command_obj.error_message = str(error_message) if error_message is not None else command_obj.error_message
db_session.commit()
logging.info(f"Command-ACK verarbeitet: command_id={command_id}, status={mapped_status}, uuid={uuid}")
except Exception as e:
db_session.rollback()
logging.error(f"Fehler bei Command-ACK Verarbeitung ({command_id}): {e}")
finally:
db_session.close()
return
# Dashboard-Handling (nested screenshot payload)
if topic.startswith("infoscreen/") and topic.endswith("/dashboard"):
uuid = topic.split("/")[1]
@@ -506,6 +586,43 @@ def on_message(client, userdata, msg):
logging.error(f"Could not parse log payload from {uuid}: {e}")
return
# Service-failed handling (systemd gave up restarting — retained message)
if topic.startswith("infoscreen/") and topic.endswith("/service_failed"):
uuid = topic.split("/")[1]
# Empty payload = retained message cleared; ignore it.
if not msg.payload:
logging.info(f"service_failed retained message cleared for {uuid}")
return
try:
payload_data = json.loads(msg.payload.decode())
failed_at_str = payload_data.get("failed_at")
unit = payload_data.get("unit", "")
try:
failed_at = datetime.datetime.fromisoformat(failed_at_str.replace("Z", "+00:00")) if failed_at_str else datetime.datetime.now(datetime.UTC)
if failed_at.tzinfo is None:
failed_at = failed_at.replace(tzinfo=datetime.UTC)
except (ValueError, AttributeError):
failed_at = datetime.datetime.now(datetime.UTC)
session = Session()
try:
client_obj = session.query(Client).filter_by(uuid=uuid).first()
if client_obj:
client_obj.service_failed_at = failed_at
client_obj.service_failed_unit = unit[:128] if unit else None
session.commit()
logging.warning(f"event=service_failed uuid={uuid} unit={unit} failed_at={failed_at.isoformat()}")
else:
logging.warning(f"service_failed received for unknown client uuid={uuid}")
except Exception as e:
session.rollback()
logging.error(f"Error persisting service_failed for {uuid}: {e}")
finally:
session.close()
except (json.JSONDecodeError, UnicodeDecodeError) as e:
logging.error(f"Could not parse service_failed payload from {uuid}: {e}")
return
# Health-Handling
if topic.startswith("infoscreen/") and topic.endswith("/health"):
uuid = topic.split("/")[1]
@@ -531,6 +648,26 @@ def on_message(client, userdata, msg):
screen_health_status=screen_health_status,
last_screenshot_analyzed=parse_timestamp((payload_data.get('health_metrics') or {}).get('last_frame_update')),
)
# Update broker connection health fields
broker_conn = payload_data.get('broker_connection')
if isinstance(broker_conn, dict):
reconnect_count = broker_conn.get('reconnect_count')
last_disconnect_str = broker_conn.get('last_disconnect_at')
if reconnect_count is not None:
try:
client_obj.mqtt_reconnect_count = int(reconnect_count)
except (ValueError, TypeError):
pass
if last_disconnect_str:
try:
last_disconnect = datetime.datetime.fromisoformat(last_disconnect_str.replace('Z', '+00:00'))
if last_disconnect.tzinfo is None:
last_disconnect = last_disconnect.replace(tzinfo=datetime.UTC)
client_obj.mqtt_last_disconnect_at = last_disconnect
except (ValueError, AttributeError):
pass
session.commit()
logging.debug(f"Health update from {uuid}: {actual.get('process')} ({actual.get('status')})")
session.close()
@@ -589,9 +726,29 @@ def main():
mqtt_client.on_connect = on_connect
# Set an exponential reconnect delay to survive broker restarts
mqtt_client.reconnect_delay_set(min_delay=1, max_delay=60)
mqtt_client.connect("mqtt", 1883)
logging.info("Listener gestartet; warte auf MQTT-Verbindung und Nachrichten")
if MQTT_USERNAME and MQTT_PASSWORD:
mqtt_client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD)
if MQTT_TLS_ENABLED:
mqtt_client.tls_set(
ca_certs=MQTT_TLS_CA_CERT,
certfile=MQTT_TLS_CERTFILE,
keyfile=MQTT_TLS_KEYFILE,
cert_reqs=ssl.CERT_REQUIRED,
)
if MQTT_TLS_INSECURE:
mqtt_client.tls_insecure_set(True)
mqtt_client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT)
logging.info(
"Listener gestartet; warte auf MQTT-Verbindung und Nachrichten (host=%s port=%s tls=%s auth=%s)",
MQTT_BROKER_HOST,
MQTT_BROKER_PORT,
MQTT_TLS_ENABLED,
bool(MQTT_USERNAME and MQTT_PASSWORD),
)
mqtt_client.loop_forever()