feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep
- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with same lifecycle + lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, "quittieren" (acknowledge) action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
@@ -4,11 +4,12 @@ import logging
|
||||
import datetime
|
||||
import base64
|
||||
import re
|
||||
import ssl
|
||||
import requests
|
||||
import paho.mqtt.client as mqtt
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from models.models import Client, ClientLog, LogLevel, ProcessStatus, ScreenHealthStatus
|
||||
from models.models import Client, ClientLog, ClientCommand, LogLevel, ProcessStatus, ScreenHealthStatus
|
||||
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||
|
||||
# Load .env only when not already configured by Docker (API_BASE_URL not set by compose means we're outside a container)
|
||||
@@ -32,6 +33,16 @@ Session = sessionmaker(bind=engine)
|
||||
# API configuration
|
||||
API_BASE_URL = os.getenv("API_BASE_URL", "http://server:8000")
|
||||
|
||||
MQTT_BROKER_HOST = os.getenv("MQTT_BROKER_HOST", "mqtt")
|
||||
MQTT_BROKER_PORT = int(os.getenv("MQTT_BROKER_PORT", os.getenv("MQTT_PORT", "1883")))
|
||||
MQTT_USERNAME = os.getenv("MQTT_USER") or os.getenv("MQTT_USERNAME")
|
||||
MQTT_PASSWORD = os.getenv("MQTT_PASSWORD")
|
||||
MQTT_TLS_ENABLED = os.getenv("MQTT_TLS_ENABLED", "false").strip().lower() in ("1", "true", "yes", "on")
|
||||
MQTT_TLS_CA_CERT = os.getenv("MQTT_TLS_CA_CERT")
|
||||
MQTT_TLS_CERTFILE = os.getenv("MQTT_TLS_CERTFILE")
|
||||
MQTT_TLS_KEYFILE = os.getenv("MQTT_TLS_KEYFILE")
|
||||
MQTT_TLS_INSECURE = os.getenv("MQTT_TLS_INSECURE", "false").strip().lower() in ("1", "true", "yes", "on")
|
||||
|
||||
# Dashboard payload migration observability
|
||||
DASHBOARD_METRICS_LOG_EVERY = int(os.getenv("DASHBOARD_METRICS_LOG_EVERY", "5"))
|
||||
DASHBOARD_PARSE_METRICS = {
|
||||
@@ -376,8 +387,11 @@ def on_connect(client, userdata, flags, reasonCode, properties):
|
||||
client.subscribe("infoscreen/+/logs/warn")
|
||||
client.subscribe("infoscreen/+/logs/info")
|
||||
client.subscribe("infoscreen/+/health")
|
||||
client.subscribe("infoscreen/+/commands/ack")
|
||||
client.subscribe("infoscreen/+/command/ack")
|
||||
client.subscribe("infoscreen/+/service_failed")
|
||||
|
||||
logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, and health")
|
||||
logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, health, and service_failed")
|
||||
except Exception as e:
|
||||
logging.error(f"Subscribe failed on connect: {e}")
|
||||
|
||||
@@ -387,6 +401,72 @@ def on_message(client, userdata, msg):
|
||||
logging.debug(f"Empfangene Nachricht auf Topic: {topic}")
|
||||
|
||||
try:
|
||||
# Command acknowledgement handling
|
||||
if topic.startswith("infoscreen/") and (topic.endswith("/commands/ack") or topic.endswith("/command/ack")):
|
||||
uuid = topic.split("/")[1]
|
||||
try:
|
||||
payload = json.loads(msg.payload.decode())
|
||||
except (json.JSONDecodeError, UnicodeDecodeError):
|
||||
logging.error(f"Ungueltiges Command-ACK Payload von {uuid}")
|
||||
return
|
||||
|
||||
command_id = payload.get("command_id")
|
||||
ack_status = str(payload.get("status", "")).strip().lower()
|
||||
error_code = payload.get("error_code")
|
||||
error_message = payload.get("error_message")
|
||||
|
||||
if not command_id:
|
||||
logging.warning(f"Command-ACK ohne command_id von {uuid}")
|
||||
return
|
||||
|
||||
status_map = {
|
||||
"accepted": "ack_received",
|
||||
"execution_started": "execution_started",
|
||||
"completed": "completed",
|
||||
"failed": "failed",
|
||||
}
|
||||
mapped_status = status_map.get(ack_status)
|
||||
if not mapped_status:
|
||||
logging.warning(f"Unbekannter Command-ACK Status '{ack_status}' von {uuid}")
|
||||
return
|
||||
|
||||
db_session = Session()
|
||||
try:
|
||||
command_obj = db_session.query(ClientCommand).filter_by(command_id=command_id).first()
|
||||
if not command_obj:
|
||||
logging.warning(f"Command-ACK fuer unbekanntes command_id={command_id} von {uuid}")
|
||||
return
|
||||
|
||||
# Ignore stale/duplicate regressions.
|
||||
terminal_states = {"completed", "failed", "expired", "canceled", "blocked_safety"}
|
||||
if command_obj.status in terminal_states:
|
||||
logging.info(
|
||||
f"Command-ACK ignoriert (bereits terminal): command_id={command_id}, status={command_obj.status}"
|
||||
)
|
||||
return
|
||||
|
||||
now_utc = datetime.datetime.now(datetime.UTC)
|
||||
command_obj.status = mapped_status
|
||||
if mapped_status == "ack_received":
|
||||
command_obj.acked_at = now_utc
|
||||
elif mapped_status == "execution_started":
|
||||
command_obj.execution_started_at = now_utc
|
||||
elif mapped_status == "completed":
|
||||
command_obj.completed_at = now_utc
|
||||
elif mapped_status == "failed":
|
||||
command_obj.failed_at = now_utc
|
||||
command_obj.error_code = str(error_code) if error_code is not None else command_obj.error_code
|
||||
command_obj.error_message = str(error_message) if error_message is not None else command_obj.error_message
|
||||
|
||||
db_session.commit()
|
||||
logging.info(f"Command-ACK verarbeitet: command_id={command_id}, status={mapped_status}, uuid={uuid}")
|
||||
except Exception as e:
|
||||
db_session.rollback()
|
||||
logging.error(f"Fehler bei Command-ACK Verarbeitung ({command_id}): {e}")
|
||||
finally:
|
||||
db_session.close()
|
||||
return
|
||||
|
||||
# Dashboard-Handling (nested screenshot payload)
|
||||
if topic.startswith("infoscreen/") and topic.endswith("/dashboard"):
|
||||
uuid = topic.split("/")[1]
|
||||
@@ -506,6 +586,43 @@ def on_message(client, userdata, msg):
|
||||
logging.error(f"Could not parse log payload from {uuid}: {e}")
|
||||
return
|
||||
|
||||
# Service-failed handling (systemd gave up restarting — retained message)
|
||||
if topic.startswith("infoscreen/") and topic.endswith("/service_failed"):
|
||||
uuid = topic.split("/")[1]
|
||||
# Empty payload = retained message cleared; ignore it.
|
||||
if not msg.payload:
|
||||
logging.info(f"service_failed retained message cleared for {uuid}")
|
||||
return
|
||||
try:
|
||||
payload_data = json.loads(msg.payload.decode())
|
||||
failed_at_str = payload_data.get("failed_at")
|
||||
unit = payload_data.get("unit", "")
|
||||
try:
|
||||
failed_at = datetime.datetime.fromisoformat(failed_at_str.replace("Z", "+00:00")) if failed_at_str else datetime.datetime.now(datetime.UTC)
|
||||
if failed_at.tzinfo is None:
|
||||
failed_at = failed_at.replace(tzinfo=datetime.UTC)
|
||||
except (ValueError, AttributeError):
|
||||
failed_at = datetime.datetime.now(datetime.UTC)
|
||||
|
||||
session = Session()
|
||||
try:
|
||||
client_obj = session.query(Client).filter_by(uuid=uuid).first()
|
||||
if client_obj:
|
||||
client_obj.service_failed_at = failed_at
|
||||
client_obj.service_failed_unit = unit[:128] if unit else None
|
||||
session.commit()
|
||||
logging.warning(f"event=service_failed uuid={uuid} unit={unit} failed_at={failed_at.isoformat()}")
|
||||
else:
|
||||
logging.warning(f"service_failed received for unknown client uuid={uuid}")
|
||||
except Exception as e:
|
||||
session.rollback()
|
||||
logging.error(f"Error persisting service_failed for {uuid}: {e}")
|
||||
finally:
|
||||
session.close()
|
||||
except (json.JSONDecodeError, UnicodeDecodeError) as e:
|
||||
logging.error(f"Could not parse service_failed payload from {uuid}: {e}")
|
||||
return
|
||||
|
||||
# Health-Handling
|
||||
if topic.startswith("infoscreen/") and topic.endswith("/health"):
|
||||
uuid = topic.split("/")[1]
|
||||
@@ -531,6 +648,26 @@ def on_message(client, userdata, msg):
|
||||
screen_health_status=screen_health_status,
|
||||
last_screenshot_analyzed=parse_timestamp((payload_data.get('health_metrics') or {}).get('last_frame_update')),
|
||||
)
|
||||
|
||||
# Update broker connection health fields
|
||||
broker_conn = payload_data.get('broker_connection')
|
||||
if isinstance(broker_conn, dict):
|
||||
reconnect_count = broker_conn.get('reconnect_count')
|
||||
last_disconnect_str = broker_conn.get('last_disconnect_at')
|
||||
if reconnect_count is not None:
|
||||
try:
|
||||
client_obj.mqtt_reconnect_count = int(reconnect_count)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
if last_disconnect_str:
|
||||
try:
|
||||
last_disconnect = datetime.datetime.fromisoformat(last_disconnect_str.replace('Z', '+00:00'))
|
||||
if last_disconnect.tzinfo is None:
|
||||
last_disconnect = last_disconnect.replace(tzinfo=datetime.UTC)
|
||||
client_obj.mqtt_last_disconnect_at = last_disconnect
|
||||
except (ValueError, AttributeError):
|
||||
pass
|
||||
|
||||
session.commit()
|
||||
logging.debug(f"Health update from {uuid}: {actual.get('process')} ({actual.get('status')})")
|
||||
session.close()
|
||||
@@ -589,9 +726,29 @@ def main():
|
||||
mqtt_client.on_connect = on_connect
|
||||
# Set an exponential reconnect delay to survive broker restarts
|
||||
mqtt_client.reconnect_delay_set(min_delay=1, max_delay=60)
|
||||
mqtt_client.connect("mqtt", 1883)
|
||||
|
||||
logging.info("Listener gestartet; warte auf MQTT-Verbindung und Nachrichten")
|
||||
if MQTT_USERNAME and MQTT_PASSWORD:
|
||||
mqtt_client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD)
|
||||
|
||||
if MQTT_TLS_ENABLED:
|
||||
mqtt_client.tls_set(
|
||||
ca_certs=MQTT_TLS_CA_CERT,
|
||||
certfile=MQTT_TLS_CERTFILE,
|
||||
keyfile=MQTT_TLS_KEYFILE,
|
||||
cert_reqs=ssl.CERT_REQUIRED,
|
||||
)
|
||||
if MQTT_TLS_INSECURE:
|
||||
mqtt_client.tls_insecure_set(True)
|
||||
|
||||
mqtt_client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT)
|
||||
|
||||
logging.info(
|
||||
"Listener gestartet; warte auf MQTT-Verbindung und Nachrichten (host=%s port=%s tls=%s auth=%s)",
|
||||
MQTT_BROKER_HOST,
|
||||
MQTT_BROKER_PORT,
|
||||
MQTT_TLS_ENABLED,
|
||||
bool(MQTT_USERNAME and MQTT_PASSWORD),
|
||||
)
|
||||
mqtt_client.loop_forever()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user