feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep
- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with same lifecycle + lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, "quittieren" (acknowledge) action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
from server.database import Session
|
||||
from models.models import Client, ClientGroup
|
||||
from flask import Blueprint, request, jsonify
|
||||
from models.models import Client, ClientGroup, ClientCommand, ProcessStatus
|
||||
from flask import Blueprint, request, jsonify, session as flask_session
|
||||
from server.permissions import admin_or_higher
|
||||
from server.routes.groups import get_grace_period, is_client_alive
|
||||
from server.mqtt_helper import publish_client_group, delete_client_group_message, publish_multiple_client_groups
|
||||
import sys
|
||||
import os
|
||||
@@ -9,13 +10,196 @@ import glob
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
import uuid as uuid_lib
|
||||
from datetime import datetime, timezone, timedelta
|
||||
sys.path.append('/workspace')
|
||||
|
||||
clients_bp = Blueprint("clients", __name__, url_prefix="/api/clients")
|
||||
|
||||
VALID_SCREENSHOT_TYPES = {"periodic", "event_start", "event_stop"}
|
||||
|
||||
# --- Client command contract -------------------------------------------------
# Value sent as "schema_version" in every published command payload.
COMMAND_SCHEMA_VERSION = "1.0"

# MQTT topics a command is published to; "{uuid}" is filled with the client UUID.
COMMAND_TOPIC_TEMPLATE = "infoscreen/{uuid}/commands"  # primary topic
COMMAND_TOPIC_COMPAT_TEMPLATE = "infoscreen/{uuid}/command"  # singular-name compatibility topic
LEGACY_RESTART_TOPIC_TEMPLATE = "clients/{uuid}/restart"  # restart-only legacy topic

# Seconds added to the issue time to compute a command's expires_at.
COMMAND_EXPIRY_SECONDS = 240

# Safety lockout: once THRESHOLD reboot/restart commands exist for a client
# within the last WINDOW minutes, further ones are rejected.
REBOOT_LOCKOUT_WINDOW_MINUTES = 15
REBOOT_LOCKOUT_THRESHOLD = 3

# Maps the action name used by the HTTP API to the action name sent to clients.
API_ACTION_TO_COMMAND_ACTION = dict(
    restart="reboot_host",
    shutdown="shutdown_host",
    restart_app="restart_app",
)

# API action names accepted by the command endpoints.
ALLOWED_COMMAND_ACTIONS = set(API_ACTION_TO_COMMAND_ACTION)
|
||||
|
||||
|
||||
def _iso_utc_z(ts: datetime) -> str:
    """Format *ts* as an ISO-8601 UTC timestamp ending in ``Z``.

    Timezone-aware values are converted to UTC. Naive values are taken to
    already be in UTC instead of being run through ``astimezone()``, which
    would interpret them in the server's local timezone and shift the result
    on any host that is not configured for UTC.

    Args:
        ts: The datetime to format.

    Returns:
        The ISO-8601 string with the ``+00:00`` offset replaced by ``Z``.
    """
    if ts.tzinfo is None or ts.utcoffset() is None:
        # Naive datetime: attach UTC without changing the wall-clock value.
        utc_ts = ts.replace(tzinfo=timezone.utc)
    else:
        utc_ts = ts.astimezone(timezone.utc)
    return utc_ts.isoformat().replace("+00:00", "Z")
|
||||
|
||||
|
||||
def _command_to_dict(command: ClientCommand) -> dict:
    """Serialize a ClientCommand into the camelCase dict returned by the API.

    Timestamp attributes are rendered with ``isoformat()``; unset (falsy)
    timestamps become ``None``. All other attributes are passed through as-is.
    """

    def _stamp(value):
        # Shared rendering rule for every timestamp field below.
        return value.isoformat() if value else None

    serialized = {
        "commandId": command.command_id,
        "clientUuid": command.client_uuid,
        "action": command.action,
        "status": command.status,
        "reason": command.reason,
        "requestedBy": command.requested_by,
        "issuedAt": _stamp(command.issued_at),
        "expiresAt": _stamp(command.expires_at),
        "publishedAt": _stamp(command.published_at),
        "ackedAt": _stamp(command.acked_at),
        "executionStartedAt": _stamp(command.execution_started_at),
        "completedAt": _stamp(command.completed_at),
        "failedAt": _stamp(command.failed_at),
        "errorCode": command.error_code,
        "errorMessage": command.error_message,
        "createdAt": _stamp(command.created_at),
        "updatedAt": _stamp(command.updated_at),
    }
    return serialized
|
||||
|
||||
|
||||
def _publish_client_command(client_uuid: str, action: str, payload: dict) -> None:
    """Publish a command payload to the MQTT topics of one client.

    The payload is published (QoS 1, not retained) to the primary command
    topic and to the singular-name compatibility topic. When the API action is
    ``"restart"``, a minimal ``{"action": "restart"}`` message is additionally
    published to the legacy restart topic.

    Broker host/port and optional credentials are read from the environment
    variables MQTT_BROKER_HOST, MQTT_BROKER_PORT, MQTT_USER and MQTT_PASSWORD.

    Args:
        client_uuid: UUID inserted into the topic templates.
        action: API-level action name (not the mapped command action).
        payload: JSON-serializable command payload.

    Raises:
        RuntimeError: If the broker does not confirm a publish within the
            timeout.
        Exception: Connection and publish errors raised by paho-mqtt propagate
            to the caller.
    """
    import paho.mqtt.client as mqtt

    publish_timeout = 5.0

    broker_host = os.getenv("MQTT_BROKER_HOST", "mqtt")
    broker_port = int(os.getenv("MQTT_BROKER_PORT", 1883))
    username = os.getenv("MQTT_USER")
    password = os.getenv("MQTT_PASSWORD")

    mqtt_client = mqtt.Client()
    if username and password:
        mqtt_client.username_pw_set(username, password)

    def _publish_confirmed(topic: str, body: dict) -> None:
        # Publish one message and fail loudly when the broker never confirms it.
        info = mqtt_client.publish(topic, json.dumps(body), qos=1, retain=False)
        info.wait_for_publish(timeout=publish_timeout)
        if not info.is_published():
            raise RuntimeError(
                f"MQTT publish to {topic} was not confirmed within {publish_timeout}s"
            )

    mqtt_client.connect(broker_host, broker_port)
    # The network loop must run so CONNACK/PUBACK packets are processed;
    # without it wait_for_publish() can only time out for QoS 1 messages.
    mqtt_client.loop_start()
    try:
        # Primary topic for contract-based command handling.
        _publish_confirmed(COMMAND_TOPIC_TEMPLATE.format(uuid=client_uuid), payload)

        # Transitional compatibility for clients that still consume singular topic naming.
        _publish_confirmed(COMMAND_TOPIC_COMPAT_TEMPLATE.format(uuid=client_uuid), payload)

        # Transitional compatibility for existing restart-only clients.
        if action == "restart":
            _publish_confirmed(
                LEGACY_RESTART_TOPIC_TEMPLATE.format(uuid=client_uuid),
                {"action": "restart"},
            )
    finally:
        # Always release the connection and stop the loop thread, even on failure.
        mqtt_client.disconnect()
        mqtt_client.loop_stop()
|
||||
|
||||
|
||||
def _issue_client_command(client_uuid: str, action: str):
    """Create, persist and publish a command for one client.

    Flow:
      1. Validate the API action and map it to the client-facing action.
      2. Read the optional ``reason`` from the JSON body and the requesting
         user id from the Flask session.
      3. Apply the reboot/restart safety lockout; a rejected request is
         still recorded with status ``blocked_safety``.
      4. Persist the command as ``queued``, then ``publish_in_progress``,
         publish it via MQTT and mark it ``published`` (unless its status was
         changed in the meantime) or ``failed``.

    Args:
        client_uuid: UUID of the target client.
        action: API-level action name; must be in ALLOWED_COMMAND_ACTIONS.

    Returns:
        A ``(response, status)`` tuple: 400 for an unsupported action, 404 for
        an unknown client, 429 when the lockout is active, 202 after a
        successful publish, 500 when publishing failed.
    """
    if action not in ALLOWED_COMMAND_ACTIONS:
        return jsonify({"error": f"Unsupported action '{action}'"}), 400

    command_action = API_ACTION_TO_COMMAND_ACTION[action]

    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Missing, invalid or non-object JSON (e.g. a list) carries no options.
        data = {}
    raw_reason = data.get("reason")
    # An explicit JSON null must not be stored as the literal string "None".
    reason = (str(raw_reason).strip() or None) if raw_reason is not None else None
    requested_by = flask_session.get("user_id")

    now_utc = datetime.now(timezone.utc)
    expires_at = now_utc + timedelta(seconds=COMMAND_EXPIRY_SECONDS)
    command_id = str(uuid_lib.uuid4())

    db = Session()
    try:
        client = db.query(Client).filter_by(uuid=client_uuid).first()
        if not client:
            return jsonify({"error": "Client nicht gefunden"}), 404

        # Safety lockout: avoid rapid repeated reboot loops per client.
        # NOTE(review): the count includes every stored reboot/restart command in
        # the window regardless of status, so rows written as "blocked_safety"
        # below also count towards the threshold - confirm this is intended.
        if command_action in ("reboot_host", "restart_app"):
            window_start = now_utc - timedelta(minutes=REBOOT_LOCKOUT_WINDOW_MINUTES)
            recent_reboots = (
                db.query(ClientCommand)
                .filter(ClientCommand.client_uuid == client_uuid)
                .filter(ClientCommand.action.in_(["reboot_host", "restart_app"]))
                .filter(ClientCommand.created_at >= window_start)
                .count()
            )
            if recent_reboots >= REBOOT_LOCKOUT_THRESHOLD:
                blocked = ClientCommand(
                    command_id=command_id,
                    client_uuid=client_uuid,
                    action=command_action,
                    status="blocked_safety",
                    reason=reason,
                    requested_by=requested_by,
                    issued_at=now_utc,
                    expires_at=expires_at,
                    failed_at=now_utc,
                    error_code="lockout_threshold",
                    error_message="Reboot lockout active for this client",
                )
                db.add(blocked)
                db.commit()
                return jsonify({
                    "success": False,
                    "message": "Neustart voruebergehend blockiert (Sicherheits-Lockout)",
                    "command": _command_to_dict(blocked),
                }), 429

        command = ClientCommand(
            command_id=command_id,
            client_uuid=client_uuid,
            action=command_action,
            status="queued",
            reason=reason,
            requested_by=requested_by,
            issued_at=now_utc,
            expires_at=expires_at,
        )
        db.add(command)
        db.commit()

        command.status = "publish_in_progress"
        db.commit()

        payload = {
            "schema_version": COMMAND_SCHEMA_VERSION,
            "command_id": command.command_id,
            "client_uuid": command.client_uuid,
            "action": command.action,
            "issued_at": _iso_utc_z(command.issued_at),
            "expires_at": _iso_utc_z(command.expires_at),
            "requested_by": command.requested_by,
            "reason": command.reason,
        }

        try:
            # The API-level action is passed on purpose: the legacy restart
            # topic is keyed on "restart", not on the mapped command action.
            _publish_client_command(client_uuid=client_uuid, action=action, payload=payload)
            # ACK can arrive very quickly (including terminal failure) while publish is in-flight.
            # Refresh to avoid regressing a newer listener-updated state back to "published".
            db.refresh(command)
            command.published_at = command.published_at or datetime.now(timezone.utc)
            if command.status in {"queued", "publish_in_progress"}:
                command.status = "published"
            db.commit()
            return jsonify({
                "success": True,
                "message": f"Command published for client {client_uuid}",
                "command": _command_to_dict(command),
            }), 202
        except Exception as publish_error:
            # Reset the session first: if the failure came from a DB operation the
            # transaction is unusable until rolled back, and the failure record
            # below could not be written.
            db.rollback()
            command.status = "failed"
            command.failed_at = datetime.now(timezone.utc)
            command.error_code = "mqtt_publish_failed"
            command.error_message = str(publish_error)
            db.commit()
            return jsonify({
                "success": False,
                "error": f"Failed to publish command: {publish_error}",
                "command": _command_to_dict(command),
            }), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
def _normalize_screenshot_type(raw_type):
|
||||
if raw_type is None:
|
||||
@@ -280,45 +464,148 @@ def get_clients_with_alive_status():
|
||||
"ip": c.ip,
|
||||
"last_alive": c.last_alive.isoformat() if c.last_alive else None,
|
||||
"is_active": c.is_active,
|
||||
"is_alive": bool(c.last_alive and c.is_active),
|
||||
"is_alive": is_client_alive(c.last_alive, c.is_active),
|
||||
})
|
||||
session.close()
|
||||
return jsonify(result)
|
||||
|
||||
|
||||
@clients_bp.route("/crashed", methods=["GET"])
@admin_or_higher
def get_crashed_clients():
    """Return active clients that are crashed or not alive.

    A client is listed when ``process_status`` is ``ProcessStatus.crashed`` or
    when ``is_client_alive`` reports it as not alive. ``crash_reason`` is
    ``"process_crashed"`` whenever the process status is crashed, otherwise
    ``"heartbeat_stale"``.

    Returns:
        JSON object with ``crashed_count``, ``grace_period_seconds`` and the
        list of affected ``clients``.
    """
    session = Session()
    try:
        # Only reported back to the caller; the liveness decision itself is
        # delegated to is_client_alive().
        grace = get_grace_period()
        clients = (
            session.query(Client)
            .filter(Client.is_active == True)  # noqa: E712 - SQLAlchemy column comparison
            .all()
        )
        result = []
        for c in clients:
            alive = is_client_alive(c.last_alive, c.is_active)
            crashed = c.process_status == ProcessStatus.crashed
            if not alive or crashed:
                result.append({
                    "uuid": c.uuid,
                    "description": c.description,
                    "hostname": c.hostname,
                    "ip": c.ip,
                    "group_id": c.group_id,
                    "is_alive": alive,
                    "process_status": c.process_status.value if c.process_status else None,
                    "screen_health_status": c.screen_health_status.value if c.screen_health_status else None,
                    "last_alive": c.last_alive.isoformat() if c.last_alive else None,
                    "crash_reason": "process_crashed" if crashed else "heartbeat_stale",
                })
        return jsonify({
            "crashed_count": len(result),
            "grace_period_seconds": grace,
            "clients": result,
        })
    finally:
        session.close()
|
||||
|
||||
|
||||
@clients_bp.route("/service_failed", methods=["GET"])
@admin_or_higher
def get_service_failed_clients():
    """List clients whose ``service_failed_at`` is set, most recent failure first."""
    session = Session()
    try:
        query = session.query(Client)
        query = query.filter(Client.service_failed_at.isnot(None))
        query = query.order_by(Client.service_failed_at.desc())

        entries = []
        for client in query.all():
            failed_at = client.service_failed_at
            last_alive = client.last_alive
            entries.append({
                "uuid": client.uuid,
                "description": client.description,
                "hostname": client.hostname,
                "ip": client.ip,
                "group_id": client.group_id,
                "service_failed_at": failed_at.isoformat() if failed_at else None,
                "service_failed_unit": client.service_failed_unit,
                "is_alive": is_client_alive(last_alive, client.is_active),
                "last_alive": last_alive.isoformat() if last_alive else None,
            })

        body = {"service_failed_count": len(entries), "clients": entries}
        return jsonify(body)
    finally:
        session.close()
|
||||
|
||||
|
||||
@clients_bp.route("/<client_uuid>/clear_service_failed", methods=["POST"])
@admin_or_higher
def clear_service_failed(client_uuid):
    """Clear the service_failed flag of a client and its retained MQTT message.

    The database fields ``service_failed_at`` and ``service_failed_unit`` are
    reset first. Afterwards an empty retained message is published to
    ``infoscreen/<client_uuid>/service_failed`` to remove the retained message
    from the broker. A failure of the MQTT step is only logged, because the
    database has already been updated.

    Returns:
        404 when the client does not exist; otherwise a success response (also
        when no flag was set, in which case nothing is published).
    """
    import logging

    import paho.mqtt.client as mqtt_lib

    session = Session()
    try:
        c = session.query(Client).filter_by(uuid=client_uuid).first()
        if not c:
            return jsonify({"error": "Client nicht gefunden"}), 404
        if c.service_failed_at is None:
            return jsonify({"success": True, "message": "Kein service_failed Flag gesetzt."}), 200

        c.service_failed_at = None
        c.service_failed_unit = None
        session.commit()
    finally:
        session.close()

    # Clear the retained MQTT message (publish empty payload, retained=True)
    try:
        broker_host = os.getenv("MQTT_BROKER_HOST", "mqtt")
        broker_port = int(os.getenv("MQTT_BROKER_PORT", 1883))
        username = os.getenv("MQTT_USER")
        password = os.getenv("MQTT_PASSWORD")
        mc = mqtt_lib.Client()
        if username and password:
            mc.username_pw_set(username, password)
        mc.connect(broker_host, broker_port)
        # Run the network loop so the QoS 1 publish can be confirmed before
        # the connection is closed.
        mc.loop_start()
        try:
            topic = f"infoscreen/{client_uuid}/service_failed"
            info = mc.publish(topic, payload=None, qos=1, retain=True)
            info.wait_for_publish(timeout=5.0)
            if not info.is_published():
                raise RuntimeError("publish not confirmed by broker within 5.0s")
        finally:
            mc.disconnect()
            mc.loop_stop()
    except Exception as e:
        # Log but don't fail - DB is already cleared
        logging.warning(
            "Could not clear retained service_failed MQTT message for %s: %s",
            client_uuid,
            e,
        )

    return jsonify({"success": True, "message": "service_failed Flag gelöscht."})
|
||||
|
||||
|
||||
@clients_bp.route("/<uuid>/restart", methods=["POST"])
|
||||
@admin_or_higher
|
||||
def restart_client(uuid):
|
||||
"""
|
||||
Route to restart a specific client by UUID.
|
||||
Sends an MQTT message to the broker to trigger the restart.
|
||||
"""
|
||||
import paho.mqtt.client as mqtt
|
||||
import json
|
||||
return _issue_client_command(client_uuid=uuid, action="restart")
|
||||
|
||||
# MQTT broker configuration
|
||||
MQTT_BROKER = "mqtt"
|
||||
MQTT_PORT = 1883
|
||||
MQTT_TOPIC = f"clients/{uuid}/restart"
|
||||
|
||||
# Connect to the database to check if the client exists
|
||||
session = Session()
|
||||
client = session.query(Client).filter_by(uuid=uuid).first()
|
||||
if not client:
|
||||
session.close()
|
||||
return jsonify({"error": "Client nicht gefunden"}), 404
|
||||
session.close()
|
||||
@clients_bp.route("/<uuid>/shutdown", methods=["POST"])
|
||||
@admin_or_higher
|
||||
def shutdown_client(uuid):
|
||||
return _issue_client_command(client_uuid=uuid, action="shutdown")
|
||||
|
||||
# Send MQTT message
|
||||
|
||||
@clients_bp.route("/commands/<command_id>", methods=["GET"])
|
||||
@admin_or_higher
|
||||
def get_client_command_status(command_id):
|
||||
db = Session()
|
||||
try:
|
||||
mqtt_client = mqtt.Client()
|
||||
mqtt_client.connect(MQTT_BROKER, MQTT_PORT)
|
||||
payload = {"action": "restart"}
|
||||
mqtt_client.publish(MQTT_TOPIC, json.dumps(payload))
|
||||
mqtt_client.disconnect()
|
||||
return jsonify({"success": True, "message": f"Restart signal sent to client {uuid}"}), 200
|
||||
except Exception as e:
|
||||
return jsonify({"error": f"Failed to send MQTT message: {str(e)}"}), 500
|
||||
command = db.query(ClientCommand).filter_by(command_id=command_id).first()
|
||||
if not command:
|
||||
return jsonify({"error": "Command nicht gefunden"}), 404
|
||||
return jsonify(_command_to_dict(command)), 200
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@clients_bp.route("/<uuid>/screenshot", methods=["POST"])
|
||||
|
||||
Reference in New Issue
Block a user