feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep

- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with same lifecycle + lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, acknowledge ("quittieren") action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
2026-04-05 10:17:56 +00:00
parent 4d652f0554
commit 03e3c11e90
35 changed files with 2511 additions and 80 deletions

View File

@@ -1,13 +1,14 @@
# scheduler/db_utils.py
from dotenv import load_dotenv
import os
from datetime import datetime
from datetime import datetime, timedelta, timezone
import hashlib
import json
import logging
from sqlalchemy.orm import sessionmaker, joinedload
from sqlalchemy import create_engine, or_, and_, text
from models.models import Event, EventMedia, EventException, SystemSetting
import uuid as _uuid_mod
from models.models import Event, EventMedia, EventException, SystemSetting, Client, ClientCommand, ProcessStatus
from dateutil.rrule import rrulestr
from urllib.request import Request, urlopen
from datetime import timezone
@@ -454,3 +455,167 @@ def format_event_with_media(event):
# Add other event types (message, etc.) here as needed...
return event_dict
# ---------------------------------------------------------------------------
# Crash detection / auto-recovery helpers
# ---------------------------------------------------------------------------

# Value written to the "schema_version" field of recovery command payloads.
_CRASH_RECOVERY_SCHEMA_VERSION = "1.0"

# Per-client MQTT topic templates; "{uuid}" is substituted via str.format().
_CRASH_COMMAND_TOPIC = "infoscreen/{uuid}/commands"
_CRASH_COMMAND_COMPAT_TOPIC = "infoscreen/{uuid}/command"

# Tunables read once at import time from the environment.
# Defaults: commands expire after 240 seconds, lockout window is 15 minutes.
_CRASH_RECOVERY_EXPIRY_SECONDS = int(
    os.environ.get("CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS", "240")
)
_CRASH_RECOVERY_LOCKOUT_MINUTES = int(
    os.environ.get("CRASH_RECOVERY_LOCKOUT_MINUTES", "15")
)


def get_crash_recovery_candidates(heartbeat_grace_seconds: int) -> list:
    """
    Return the active clients that currently need a crash-recovery command.

    A client qualifies when it is active and either reports
    ``process_status == ProcessStatus.crashed`` or its ``last_alive``
    timestamp is older than ``heartbeat_grace_seconds``.  Clients that already
    have a ``reboot_host`` or ``restart_app`` command created within the last
    ``_CRASH_RECOVERY_LOCKOUT_MINUTES`` minutes are skipped.

    Args:
        heartbeat_grace_seconds: Maximum allowed age of ``last_alive`` in
            seconds before a client counts as heartbeat-stale.

    Returns:
        A list of dicts with the keys ``uuid``, ``reason``
        (``"process_crashed"`` or ``"heartbeat_stale"``), ``process_status``
        (enum value or ``None``) and ``last_alive``.
    """
    session = Session()
    try:
        now = datetime.now(timezone.utc)
        stale_cutoff = now - timedelta(seconds=heartbeat_grace_seconds)
        lockout_cutoff = now - timedelta(minutes=_CRASH_RECOVERY_LOCKOUT_MINUTES)

        candidates = (
            session.query(Client)
            .filter(Client.is_active == True)  # noqa: E712 - SQL expression, not a Python comparison
            .filter(
                or_(
                    Client.process_status == ProcessStatus.crashed,
                    Client.last_alive < stale_cutoff,
                )
            )
            .all()
        )
        if not candidates:
            # Nothing to check; also avoids issuing an IN () query with no values.
            return []

        # Resolve the lockout for all candidates in ONE query instead of one
        # query per candidate (the previous per-client lookup was an N+1).
        # NOTE(review): assumes Client.uuid and ClientCommand.client_uuid hold
        # values that compare equal in Python whenever they match in the DB.
        locked_rows = (
            session.query(ClientCommand.client_uuid)
            .filter(ClientCommand.client_uuid.in_([c.uuid for c in candidates]))
            .filter(ClientCommand.created_at >= lockout_cutoff)
            .filter(ClientCommand.action.in_(["reboot_host", "restart_app"]))
            .distinct()
            .all()
        )
        locked_out = {row[0] for row in locked_rows}

        result = []
        for c in candidates:
            if c.uuid in locked_out:
                continue
            crash_reason = (
                "process_crashed"
                if c.process_status == ProcessStatus.crashed
                else "heartbeat_stale"
            )
            result.append({
                "uuid": c.uuid,
                "reason": crash_reason,
                "process_status": c.process_status.value if c.process_status else None,
                "last_alive": c.last_alive,
            })
        return result
    finally:
        session.close()


def issue_crash_recovery_command(client_uuid: str, reason: str) -> tuple:
    """
    Persist a ``reboot_host`` recovery command and build its MQTT message.

    The command row is first committed with status ``"queued"`` and then,
    in a second commit, moved to ``"publish_in_progress"``.  Publishing is
    left to the caller.

    Args:
        client_uuid: UUID of the client the command is addressed to.
        reason: Stored on the command row and echoed in the payload.

    Returns:
        A 4-tuple ``(command_id, payload, topic, compat_topic)`` where
        ``payload`` is the dict to publish and the two topics are the
        per-client command topic strings.

    Raises:
        Exception: Any error raised while writing the command is re-raised
            after the session has been rolled back.
    """
    action = "reboot_host"
    db = Session()
    try:
        issued_at = datetime.now(timezone.utc)
        expires_at = issued_at + timedelta(seconds=_CRASH_RECOVERY_EXPIRY_SECONDS)
        command_id = str(_uuid_mod.uuid4())

        record = ClientCommand(
            command_id=command_id,
            client_uuid=client_uuid,
            action=action,
            status="queued",
            reason=reason,
            requested_by=None,
            issued_at=issued_at,
            expires_at=expires_at,
        )
        db.add(record)
        db.commit()

        # Second commit: record that the caller is about to publish.
        record.status = "publish_in_progress"
        db.commit()

        # Timestamps go out as ISO-8601 with a "Z" suffix instead of "+00:00".
        issued_iso = issued_at.isoformat().replace("+00:00", "Z")
        expires_iso = expires_at.isoformat().replace("+00:00", "Z")

        payload = {
            "schema_version": _CRASH_RECOVERY_SCHEMA_VERSION,
            "command_id": command_id,
            "client_uuid": client_uuid,
            "action": action,
            "issued_at": issued_iso,
            "expires_at": expires_iso,
            "requested_by": None,
            "reason": reason,
        }
        return (
            command_id,
            payload,
            _CRASH_COMMAND_TOPIC.format(uuid=client_uuid),
            _CRASH_COMMAND_COMPAT_TOPIC.format(uuid=client_uuid),
        )
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()


def finalize_crash_recovery_command(command_id: str, published: bool, error: str = None) -> None:
    """
    Record the outcome of the MQTT publish attempt on a recovery command.

    Args:
        command_id: ``command_id`` of the ClientCommand row to update.
        published: ``True`` sets status ``"published"`` and ``published_at``;
            ``False`` sets status ``"failed"``, ``failed_at``, ``error_code``
            (``"mqtt_publish_failed"``) and ``error_message``.
        error: Optional error text stored when ``published`` is ``False``;
            falls back to ``"Unknown publish error"``.

    Raises:
        Exception: Any database error is re-raised after the session has
            been rolled back.
    """
    session = Session()
    try:
        cmd = session.query(ClientCommand).filter_by(command_id=command_id).first()
        if cmd is None:
            # Still a no-op for the caller, but no longer a silent one.
            logging.getLogger(__name__).warning(
                "finalize_crash_recovery_command: command %s not found", command_id
            )
            return
        now = datetime.now(timezone.utc)
        if published:
            cmd.status = "published"
            cmd.published_at = now
        else:
            cmd.status = "failed"
            cmd.failed_at = now
            cmd.error_code = "mqtt_publish_failed"
            cmd.error_message = error or "Unknown publish error"
        session.commit()
    except Exception:
        # Explicit rollback, matching the other command writers in this module.
        session.rollback()
        raise
    finally:
        session.close()


# Statuses after which a command is never touched by the expiry sweep.
_TERMINAL_COMMAND_STATUSES = {"completed", "failed", "expired", "canceled", "blocked_safety"}


def sweep_expired_commands() -> int:
    """
    Expire every non-terminal command whose ``expires_at`` lies in the past.

    Each affected row gets status ``"expired"``, ``failed_at`` set to the
    sweep time, and ``error_code`` / ``error_message`` describing the expiry.

    Returns:
        The number of commands that were marked as expired (0 if none).

    Raises:
        Exception: Any error is re-raised after the session has been
            rolled back.
    """
    db = Session()
    try:
        swept_at = datetime.now(timezone.utc)
        overdue = (
            db.query(ClientCommand)
            .filter(ClientCommand.expires_at < swept_at)
            .filter(ClientCommand.status.notin_(_TERMINAL_COMMAND_STATUSES))
            .all()
        )
        if overdue:
            for command in overdue:
                command.status = "expired"
                command.failed_at = swept_at
                command.error_code = "expired"
                command.error_message = "Command expired before terminal state was reached."
            # Commit only when something changed.
            db.commit()
        return len(overdue)
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()