feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep

- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with same lifecycle + lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, acknowledge ("quittieren") action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
2026-04-05 10:17:56 +00:00
parent 4d652f0554
commit 03e3c11e90
35 changed files with 2511 additions and 80 deletions

View File

@@ -1,7 +1,8 @@
from server.database import Session
from models.models import Client, ClientGroup
from flask import Blueprint, request, jsonify
from models.models import Client, ClientGroup, ClientCommand, ProcessStatus
from flask import Blueprint, request, jsonify, session as flask_session
from server.permissions import admin_or_higher
from server.routes.groups import get_grace_period, is_client_alive
from server.mqtt_helper import publish_client_group, delete_client_group_message, publish_multiple_client_groups
import sys
import os
@@ -9,13 +10,196 @@ import glob
import base64
import hashlib
import json
from datetime import datetime, timezone
import uuid as uuid_lib
from datetime import datetime, timezone, timedelta
# Extend the import search path with /workspace.
# NOTE(review): presumably needed so project packages resolve inside the container — confirm.
sys.path.append('/workspace')
# Blueprint holding every route in this module; all URLs are prefixed with /api/clients.
clients_bp = Blueprint("clients", __name__, url_prefix="/api/clients")
# Accepted screenshot type labels.
# NOTE(review): the consumer is outside the visible part of this file — confirm usage.
VALID_SCREENSHOT_TYPES = {"periodic", "event_start", "event_stop"}
# Value sent as "schema_version" in every published command payload.
COMMAND_SCHEMA_VERSION = "1.0"
# Primary per-client command topic.
COMMAND_TOPIC_TEMPLATE = "infoscreen/{uuid}/commands"
# Singular-named topic that receives the same payload for transitional compatibility.
COMMAND_TOPIC_COMPAT_TEMPLATE = "infoscreen/{uuid}/command"
# Legacy topic that only receives {"action": "restart"} when the API action is "restart".
LEGACY_RESTART_TOPIC_TEMPLATE = "clients/{uuid}/restart"
# Lifetime of a command: expires_at = issue time + this many seconds.
COMMAND_EXPIRY_SECONDS = 240
# Safety lockout: once THRESHOLD reboot/restart commands exist for a client within
# the last WINDOW minutes, further reboot/restart requests are rejected.
REBOOT_LOCKOUT_WINDOW_MINUTES = 15
REBOOT_LOCKOUT_THRESHOLD = 3
# Maps the action name used by the HTTP API to the action name stored on the
# command and sent to the client.
API_ACTION_TO_COMMAND_ACTION = {
"restart": "reboot_host",
"shutdown": "shutdown_host",
"restart_app": "restart_app",
}
# API-level action names accepted by _issue_client_command.
ALLOWED_COMMAND_ACTIONS = set(API_ACTION_TO_COMMAND_ACTION.keys())
def _iso_utc_z(ts: datetime) -> str:
return ts.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
def _command_to_dict(command: ClientCommand) -> dict:
    """Serialise a command row into the camelCase dict returned by the API.

    Timestamp attributes are rendered with ``isoformat()``; unset (falsy)
    timestamps become ``None``. All other attributes are copied unchanged.
    """

    def _iso(value):
        # Same truthiness check as a plain "x.isoformat() if x else None".
        return value.isoformat() if value else None

    serialised = {
        "commandId": command.command_id,
        "clientUuid": command.client_uuid,
        "action": command.action,
        "status": command.status,
        "reason": command.reason,
        "requestedBy": command.requested_by,
        "issuedAt": _iso(command.issued_at),
        "expiresAt": _iso(command.expires_at),
        "publishedAt": _iso(command.published_at),
        "ackedAt": _iso(command.acked_at),
        "executionStartedAt": _iso(command.execution_started_at),
        "completedAt": _iso(command.completed_at),
        "failedAt": _iso(command.failed_at),
        "errorCode": command.error_code,
        "errorMessage": command.error_message,
        "createdAt": _iso(command.created_at),
        "updatedAt": _iso(command.updated_at),
    }
    return serialised
def _publish_client_command(client_uuid: str, action: str, payload: dict) -> None:
    """Publish a command payload to the client's MQTT command topics.

    The payload is sent (QoS 1, not retained) to the primary command topic and
    to the singular-named compatibility topic. When the API action is
    ``"restart"`` an additional ``{"action": "restart"}`` message is sent to
    the legacy restart topic.

    Broker host/port and optional credentials are read from the environment
    (``MQTT_BROKER_HOST``, ``MQTT_BROKER_PORT``, ``MQTT_USER``, ``MQTT_PASSWORD``).

    Args:
        client_uuid: UUID used to build the topic names.
        action: API-level action name (only ``"restart"`` triggers the legacy topic).
        payload: JSON-serialisable command payload.

    Raises:
        RuntimeError: if the broker did not confirm the primary-topic publish
            within the timeout.
        Exception: connection/publish errors raised by paho are propagated.
    """
    import paho.mqtt.client as mqtt

    broker_host = os.getenv("MQTT_BROKER_HOST", "mqtt")
    broker_port = int(os.getenv("MQTT_BROKER_PORT", 1883))
    username = os.getenv("MQTT_USER")
    password = os.getenv("MQTT_PASSWORD")

    primary_topic = COMMAND_TOPIC_TEMPLATE.format(uuid=client_uuid)
    serialised_payload = json.dumps(payload)
    messages = [
        # Primary topic for contract-based command handling.
        (primary_topic, serialised_payload),
        # Transitional compatibility for clients that still consume singular topic naming.
        (COMMAND_TOPIC_COMPAT_TEMPLATE.format(uuid=client_uuid), serialised_payload),
    ]
    if action == "restart":
        # Transitional compatibility for existing restart-only clients.
        messages.append(
            (LEGACY_RESTART_TOPIC_TEMPLATE.format(uuid=client_uuid), json.dumps({"action": "restart"}))
        )

    mqtt_client = mqtt.Client()
    if username and password:
        mqtt_client.username_pw_set(username, password)
    mqtt_client.connect(broker_host, broker_port)
    # QoS 1 completes only once the PUBACK has been processed, which requires
    # the network loop to run; without it wait_for_publish() can only time out.
    mqtt_client.loop_start()
    try:
        for topic, body in messages:
            info = mqtt_client.publish(topic, body, qos=1, retain=False)
            info.wait_for_publish(timeout=5.0)
            # Only the primary topic is mandatory; the compatibility topics
            # stay best-effort so they cannot fail an otherwise delivered command.
            if topic == primary_topic and not info.is_published():
                raise RuntimeError(f"MQTT publish to {topic} was not confirmed by the broker")
    finally:
        # Always release the connection and the loop thread, even on failure.
        mqtt_client.disconnect()
        mqtt_client.loop_stop()
def _issue_client_command(client_uuid: str, action: str):
    """Create, persist and publish a lifecycle command for one client.

    Flow: validate the API action, load the client, apply the reboot safety
    lockout, store the command as ``queued`` -> ``publish_in_progress``,
    publish it via MQTT and finally mark it ``published`` (or ``failed``).

    Args:
        client_uuid: UUID of the target client.
        action: API-level action name (a key of API_ACTION_TO_COMMAND_ACTION).

    Returns:
        A Flask ``(response, status)`` tuple:
        400 unsupported action, 404 unknown client, 429 safety lockout,
        202 command published, 500 MQTT publish failed.
    """
    if action not in ALLOWED_COMMAND_ACTIONS:
        return jsonify({"error": f"Unsupported action '{action}'"}), 400

    command_action = API_ACTION_TO_COMMAND_ACTION[action]

    # The body is optional; anything that is not a JSON object is treated as empty
    # instead of failing on ``.get``.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    # An explicit JSON null must not be stored as the literal string "None".
    raw_reason = data.get("reason")
    reason = (str(raw_reason).strip() or None) if raw_reason is not None else None
    requested_by = flask_session.get("user_id")

    now_utc = datetime.now(timezone.utc)
    expires_at = now_utc + timedelta(seconds=COMMAND_EXPIRY_SECONDS)
    command_id = str(uuid_lib.uuid4())

    db = Session()
    try:
        client = db.query(Client).filter_by(uuid=client_uuid).first()
        if not client:
            return jsonify({"error": "Client nicht gefunden"}), 404

        # Safety lockout: avoid rapid repeated reboot loops per client.
        if command_action in ("reboot_host", "restart_app"):
            window_start = now_utc - timedelta(minutes=REBOOT_LOCKOUT_WINDOW_MINUTES)
            recent_reboots = (
                db.query(ClientCommand)
                .filter(ClientCommand.client_uuid == client_uuid)
                .filter(ClientCommand.action.in_(["reboot_host", "restart_app"]))
                # Rejected attempts never reached the client; counting them would
                # make every retry extend the lockout indefinitely.
                .filter(ClientCommand.status != "blocked_safety")
                .filter(ClientCommand.created_at >= window_start)
                .count()
            )
            if recent_reboots >= REBOOT_LOCKOUT_THRESHOLD:
                # Persist the rejected attempt so it stays visible in the command history.
                blocked = ClientCommand(
                    command_id=command_id,
                    client_uuid=client_uuid,
                    action=command_action,
                    status="blocked_safety",
                    reason=reason,
                    requested_by=requested_by,
                    issued_at=now_utc,
                    expires_at=expires_at,
                    failed_at=now_utc,
                    error_code="lockout_threshold",
                    error_message="Reboot lockout active for this client",
                )
                db.add(blocked)
                db.commit()
                return jsonify({
                    "success": False,
                    "message": "Neustart voruebergehend blockiert (Sicherheits-Lockout)",
                    "command": _command_to_dict(blocked),
                }), 429

        command = ClientCommand(
            command_id=command_id,
            client_uuid=client_uuid,
            action=command_action,
            status="queued",
            reason=reason,
            requested_by=requested_by,
            issued_at=now_utc,
            expires_at=expires_at,
        )
        db.add(command)
        db.commit()

        command.status = "publish_in_progress"
        db.commit()

        payload = {
            "schema_version": COMMAND_SCHEMA_VERSION,
            "command_id": command.command_id,
            "client_uuid": command.client_uuid,
            "action": command.action,
            "issued_at": _iso_utc_z(command.issued_at),
            "expires_at": _iso_utc_z(command.expires_at),
            "requested_by": command.requested_by,
            "reason": command.reason,
        }

        # Only the publish itself is guarded: a database error after a successful
        # publish must not be recorded as "mqtt_publish_failed".
        try:
            _publish_client_command(client_uuid=client_uuid, action=action, payload=payload)
        except Exception as publish_error:
            # Reset the session so the failure state can always be written.
            db.rollback()
            command.status = "failed"
            command.failed_at = datetime.now(timezone.utc)
            command.error_code = "mqtt_publish_failed"
            command.error_message = str(publish_error)
            db.commit()
            return jsonify({
                "success": False,
                "error": f"Failed to publish command: {publish_error}",
                "command": _command_to_dict(command),
            }), 500

        # ACK can arrive very quickly (including terminal failure) while publish is in-flight.
        # Refresh to avoid regressing a newer listener-updated state back to "published".
        db.refresh(command)
        command.published_at = command.published_at or datetime.now(timezone.utc)
        if command.status in {"queued", "publish_in_progress"}:
            command.status = "published"
        db.commit()
        return jsonify({
            "success": True,
            "message": f"Command published for client {client_uuid}",
            "command": _command_to_dict(command),
        }), 202
    finally:
        db.close()
def _normalize_screenshot_type(raw_type):
if raw_type is None:
@@ -280,45 +464,148 @@ def get_clients_with_alive_status():
"ip": c.ip,
"last_alive": c.last_alive.isoformat() if c.last_alive else None,
"is_active": c.is_active,
"is_alive": bool(c.last_alive and c.is_active),
"is_alive": is_client_alive(c.last_alive, c.is_active),
})
session.close()
return jsonify(result)
@clients_bp.route("/crashed", methods=["GET"])
@admin_or_higher
def get_crashed_clients():
    """Return active clients that are crashed or whose heartbeat is stale.

    A client is listed when ``is_client_alive`` reports it as not alive or its
    ``process_status`` equals ``ProcessStatus.crashed``. When both apply,
    ``crash_reason`` is reported as ``"process_crashed"``.

    Returns:
        JSON with ``crashed_count``, ``grace_period_seconds`` and ``clients``.
    """
    session = Session()
    try:
        grace = get_grace_period()
        active_clients = (
            session.query(Client)
            .filter(Client.is_active == True)  # noqa: E712 - SQLAlchemy column comparison
            .all()
        )
        result = []
        for c in active_clients:
            alive = is_client_alive(c.last_alive, c.is_active)
            crashed = c.process_status == ProcessStatus.crashed
            if alive and not crashed:
                continue
            result.append({
                "uuid": c.uuid,
                "description": c.description,
                "hostname": c.hostname,
                "ip": c.ip,
                "group_id": c.group_id,
                "is_alive": alive,
                "process_status": c.process_status.value if c.process_status else None,
                "screen_health_status": c.screen_health_status.value if c.screen_health_status else None,
                "last_alive": c.last_alive.isoformat() if c.last_alive else None,
                "crash_reason": "process_crashed" if crashed else "heartbeat_stale",
            })
        return jsonify({
            "crashed_count": len(result),
            "grace_period_seconds": grace,
            "clients": result,
        })
    finally:
        session.close()
@clients_bp.route("/service_failed", methods=["GET"])
@admin_or_higher
def get_service_failed_clients():
    """List every client whose ``service_failed_at`` is set, newest failure first.

    Returns:
        JSON with ``service_failed_count`` and the ``clients`` entries.
    """
    session = Session()
    try:
        query = session.query(Client)
        query = query.filter(Client.service_failed_at.isnot(None))
        query = query.order_by(Client.service_failed_at.desc())

        entries = []
        for row in query.all():
            failed_at = row.service_failed_at.isoformat() if row.service_failed_at else None
            last_seen = row.last_alive.isoformat() if row.last_alive else None
            entries.append({
                "uuid": row.uuid,
                "description": row.description,
                "hostname": row.hostname,
                "ip": row.ip,
                "group_id": row.group_id,
                "service_failed_at": failed_at,
                "service_failed_unit": row.service_failed_unit,
                "is_alive": is_client_alive(row.last_alive, row.is_active),
                "last_alive": last_seen,
            })
        return jsonify({"service_failed_count": len(entries), "clients": entries})
    finally:
        session.close()
@clients_bp.route("/<client_uuid>/clear_service_failed", methods=["POST"])
@admin_or_higher
def clear_service_failed(client_uuid):
    """Clear the service_failed flag of a client and its retained MQTT message.

    The database fields ``service_failed_at`` / ``service_failed_unit`` are
    reset first. Afterwards an empty retained message is published to
    ``infoscreen/<uuid>/service_failed``. MQTT problems are logged but do not
    fail the request, because the database is already cleared at that point.

    Returns:
        404 when the client does not exist, otherwise a success JSON (200).
        When no flag is set the retained message is left untouched.
    """
    import logging
    import paho.mqtt.client as mqtt_lib

    session = Session()
    try:
        c = session.query(Client).filter_by(uuid=client_uuid).first()
        if not c:
            return jsonify({"error": "Client nicht gefunden"}), 404
        if c.service_failed_at is None:
            return jsonify({"success": True, "message": "Kein service_failed Flag gesetzt."}), 200
        c.service_failed_at = None
        c.service_failed_unit = None
        session.commit()
    finally:
        session.close()

    # Clear the retained MQTT message (publish empty payload, retained=True)
    try:
        broker_host = os.getenv("MQTT_BROKER_HOST", "mqtt")
        broker_port = int(os.getenv("MQTT_BROKER_PORT", 1883))
        username = os.getenv("MQTT_USER")
        password = os.getenv("MQTT_PASSWORD")
        mc = mqtt_lib.Client()
        if username and password:
            mc.username_pw_set(username, password)
        mc.connect(broker_host, broker_port)
        # QoS 1 needs the network loop to process the PUBACK; disconnecting
        # right after publish() gives no assurance the broker handled the message.
        mc.loop_start()
        try:
            topic = f"infoscreen/{client_uuid}/service_failed"
            info = mc.publish(topic, payload=None, qos=1, retain=True)
            info.wait_for_publish(timeout=5.0)
            if not info.is_published():
                logging.warning(
                    "Could not clear retained service_failed MQTT message for %s: %s",
                    client_uuid,
                    "publish not confirmed by broker",
                )
        finally:
            mc.disconnect()
            mc.loop_stop()
    except Exception as e:
        # Log but don't fail — DB is already cleared
        logging.warning(
            "Could not clear retained service_failed MQTT message for %s: %s", client_uuid, e
        )

    return jsonify({"success": True, "message": "service_failed Flag gelöscht."})
@clients_bp.route("/<uuid>/restart", methods=["POST"])
@admin_or_higher
def restart_client(uuid):
    """Issue a restart command for the client identified by *uuid*.

    Delegates to ``_issue_client_command`` with the API action ``"restart"``,
    which handles the client lookup, the safety lockout and the MQTT publish.
    The former inline implementation (hard-coded broker settings and a
    separate client lookup) sat after the ``return`` and was unreachable,
    so it has been removed.
    """
    return _issue_client_command(client_uuid=uuid, action="restart")
@clients_bp.route("/<uuid>/shutdown", methods=["POST"])
@admin_or_higher
def shutdown_client(uuid):
    """Issue a shutdown command for the client identified by *uuid*."""
    response = _issue_client_command(uuid, "shutdown")
    return response
@clients_bp.route("/commands/<command_id>", methods=["GET"])
@admin_or_higher
def get_client_command_status(command_id):
    """Return the stored state of a single command.

    Only the lookup belongs here. Leftover legacy restart-publish statements
    that referenced names not defined in this function (``mqtt``,
    ``MQTT_BROKER``, ``MQTT_PORT``, ``MQTT_TOPIC``, ``uuid``) have been removed.

    Returns:
        404 when no command with *command_id* exists, otherwise the
        serialised command (200).
    """
    db = Session()
    try:
        command = db.query(ClientCommand).filter_by(command_id=command_id).first()
        if not command:
            return jsonify({"error": "Command nicht gefunden"}), 404
        return jsonify(_command_to_dict(command)), 200
    finally:
        db.close()
@clients_bp.route("/<uuid>/screenshot", methods=["POST"])