feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep
- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat) - Add restart_app command action with same lifecycle + lockout as reboot_host - Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish) - Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands) - Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit - Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at - DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients - Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed - Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client - Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, quittieren action) - Frontend: MQTT reconnect count + last disconnect in client detail panel - MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false - Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle - Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated - Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
@@ -1,13 +1,14 @@
|
||||
# scheduler/db_utils.py
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
from sqlalchemy.orm import sessionmaker, joinedload
|
||||
from sqlalchemy import create_engine, or_, and_, text
|
||||
from models.models import Event, EventMedia, EventException, SystemSetting
|
||||
import uuid as _uuid_mod
|
||||
from models.models import Event, EventMedia, EventException, SystemSetting, Client, ClientCommand, ProcessStatus
|
||||
from dateutil.rrule import rrulestr
|
||||
from urllib.request import Request, urlopen
|
||||
from datetime import timezone
|
||||
@@ -454,3 +455,167 @@ def format_event_with_media(event):
|
||||
# Add other event types (message, etc.) here as needed...
|
||||
|
||||
return event_dict
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Crash detection / auto-recovery helpers
# ---------------------------------------------------------------------------


# Schema version stamped into every recovery command payload published to MQTT.
_CRASH_RECOVERY_SCHEMA_VERSION = "1.0"
# Primary per-client command topic (uuid is interpolated at publish time).
_CRASH_COMMAND_TOPIC = "infoscreen/{uuid}/commands"
# Legacy singular-form topic kept for backward compatibility with older clients.
_CRASH_COMMAND_COMPAT_TOPIC = "infoscreen/{uuid}/command"
# How long (seconds) an auto-issued recovery command stays valid before the
# expiry sweep marks it expired. Overridable via environment.
_CRASH_RECOVERY_EXPIRY_SECONDS = int(os.getenv("CRASH_RECOVERY_COMMAND_EXPIRY_SECONDS", "240"))
# Minimum gap (minutes) between recovery commands for the same client, to
# avoid reboot loops. Overridable via environment.
_CRASH_RECOVERY_LOCKOUT_MINUTES = int(os.getenv("CRASH_RECOVERY_LOCKOUT_MINUTES", "15"))
|
||||
|
||||
|
||||
def get_crash_recovery_candidates(heartbeat_grace_seconds: int) -> list:
    """
    Collect active clients that look crashed and are eligible for auto-recovery.

    A client qualifies when its ``process_status`` is ``crashed`` or its last
    heartbeat is older than *heartbeat_grace_seconds*, and no reboot_host /
    restart_app command was created for it inside the lockout window.

    :param heartbeat_grace_seconds: age (seconds) after which a heartbeat
        counts as stale.
    :return: list of dicts with ``uuid``, ``reason``, ``process_status`` and
        ``last_alive`` for every eligible client.
    """
    session = Session()
    try:
        now = datetime.now(timezone.utc)
        stale_cutoff = now - timedelta(seconds=heartbeat_grace_seconds)
        lockout_cutoff = now - timedelta(minutes=_CRASH_RECOVERY_LOCKOUT_MINUTES)

        crashed_or_stale = or_(
            Client.process_status == ProcessStatus.crashed,
            Client.last_alive < stale_cutoff,
        )
        clients = (
            session.query(Client)
            .filter(Client.is_active == True)
            .filter(crashed_or_stale)
            .all()
        )

        eligible = []
        for client in clients:
            # Respect the lockout window: skip clients that already received
            # a recovery-style command recently, to avoid reboot loops.
            recent_cmd = (
                session.query(ClientCommand)
                .filter(ClientCommand.client_uuid == client.uuid)
                .filter(ClientCommand.created_at >= lockout_cutoff)
                .filter(ClientCommand.action.in_(["reboot_host", "restart_app"]))
                .first()
            )
            if recent_cmd is not None:
                continue

            if client.process_status == ProcessStatus.crashed:
                reason = "process_crashed"
            else:
                reason = "heartbeat_stale"

            eligible.append(
                {
                    "uuid": client.uuid,
                    "reason": reason,
                    "process_status": client.process_status.value if client.process_status else None,
                    "last_alive": client.last_alive,
                }
            )
        return eligible
    finally:
        session.close()
|
||||
|
||||
|
||||
def issue_crash_recovery_command(client_uuid: str, reason: str) -> tuple:
    """
    Persist a crash-recovery ``reboot_host`` ClientCommand for *client_uuid*.

    The row is first committed as ``queued`` and then flipped to
    ``publish_in_progress`` in a second commit, so the lifecycle is on record
    even if the process dies before the MQTT publish happens.

    :param client_uuid: UUID of the client to recover.
    :param reason: crash reason string (e.g. ``process_crashed``).
    :return: ``(command_id, payload, topic, compat_topic)`` — the caller is
        responsible for publishing *payload* to both topics over MQTT.
    """
    session = Session()
    try:
        issued_at = datetime.now(timezone.utc)
        expires_at = issued_at + timedelta(seconds=_CRASH_RECOVERY_EXPIRY_SECONDS)
        command_id = str(_uuid_mod.uuid4())

        command = ClientCommand(
            command_id=command_id,
            client_uuid=client_uuid,
            action="reboot_host",
            status="queued",
            reason=reason,
            requested_by=None,
            issued_at=issued_at,
            expires_at=expires_at,
        )
        session.add(command)
        session.commit()

        # Record that a publish attempt is about to start before handing
        # the payload back to the caller.
        command.status = "publish_in_progress"
        session.commit()

        payload = {
            "schema_version": _CRASH_RECOVERY_SCHEMA_VERSION,
            "command_id": command_id,
            "client_uuid": client_uuid,
            "action": "reboot_host",
            "issued_at": issued_at.isoformat().replace("+00:00", "Z"),
            "expires_at": expires_at.isoformat().replace("+00:00", "Z"),
            "requested_by": None,
            "reason": reason,
        }
        return (
            command_id,
            payload,
            _CRASH_COMMAND_TOPIC.format(uuid=client_uuid),
            _CRASH_COMMAND_COMPAT_TOPIC.format(uuid=client_uuid),
        )
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
def finalize_crash_recovery_command(command_id: str, published: bool, error: str = None) -> None:
    """
    Record the outcome of the MQTT publish attempt for a recovery command.

    Marks the command ``published`` (with ``published_at``) on success, or
    ``failed`` (with ``failed_at``, error code and message) on failure.
    Unknown command_ids are silently ignored — the command may already have
    been swept or was never persisted.

    :param command_id: UUID string of the ClientCommand to update.
    :param published: True when the MQTT publish succeeded.
    :param error: optional human-readable detail for a failed publish.
    """
    session = Session()
    try:
        cmd = session.query(ClientCommand).filter_by(command_id=command_id).first()
        if not cmd:
            return
        now = datetime.now(timezone.utc)
        if published:
            cmd.status = "published"
            cmd.published_at = now
        else:
            cmd.status = "failed"
            cmd.failed_at = now
            cmd.error_code = "mqtt_publish_failed"
            cmd.error_message = error or "Unknown publish error"
        session.commit()
    except Exception:
        # Fix: roll back on failure so the session isn't left dirty —
        # consistent with issue_crash_recovery_command / sweep_expired_commands.
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
|
||||
# Command statuses that are final: the expiry sweep must not touch these.
_TERMINAL_COMMAND_STATUSES = {"completed", "failed", "expired", "canceled", "blocked_safety"}
|
||||
|
||||
|
||||
def sweep_expired_commands() -> int:
    """
    Transition every overdue, non-terminal command to the ``expired`` state.

    A command is overdue when its ``expires_at`` lies in the past while its
    status has not yet reached one of ``_TERMINAL_COMMAND_STATUSES``.

    :return: number of commands that were marked expired.
    """
    session = Session()
    try:
        now = datetime.now(timezone.utc)
        overdue = (
            session.query(ClientCommand)
            .filter(
                ClientCommand.expires_at < now,
                ClientCommand.status.notin_(_TERMINAL_COMMAND_STATUSES),
            )
            .all()
        )
        if not overdue:
            # Nothing to sweep — avoid an empty commit.
            return 0

        for stale_cmd in overdue:
            stale_cmd.status = "expired"
            stale_cmd.failed_at = now
            stale_cmd.error_code = "expired"
            stale_cmd.error_message = "Command expired before terminal state was reached."
        session.commit()
        return len(overdue)
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
|
||||
|
||||
Reference in New Issue
Block a user