feat: crash recovery, service_failed monitoring, broker health fields, command expiry sweep
- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat)
- Add restart_app command action with the same lifecycle and lockout as reboot_host
- Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish)
- Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands)
- Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit
- Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at
- DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients
- Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed
- Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client
- Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, acknowledge ("quittieren") action)
- Frontend: MQTT reconnect count + last disconnect in client detail panel
- MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false
- Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle
- Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated
- Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
This commit is contained in:
@@ -8,12 +8,28 @@ from .db_utils import (
|
||||
compute_group_power_intent_basis,
|
||||
build_group_power_intent_body,
|
||||
compute_group_power_intent_fingerprint,
|
||||
get_crash_recovery_candidates,
|
||||
issue_crash_recovery_command,
|
||||
finalize_crash_recovery_command,
|
||||
sweep_expired_commands,
|
||||
)
|
||||
import paho.mqtt.client as mqtt
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
import uuid
|
||||
import ssl
|
||||
|
||||
|
||||
# Broker endpoint. MQTT_BROKER_HOST / MQTT_BROKER_PORT are read first; when
# they are unset, MQTT_BROKER_URL / MQTT_PORT are consulted before falling
# back to the built-in defaults ("mqtt" and 1883).
MQTT_BROKER_HOST = os.environ.get(
    "MQTT_BROKER_HOST",
    os.environ.get("MQTT_BROKER_URL", "mqtt"),
)
MQTT_BROKER_PORT = int(
    os.environ.get(
        "MQTT_BROKER_PORT",
        os.environ.get("MQTT_PORT", "1883"),
    )
)

# Credentials. MQTT_USER is preferred; an unset or empty value falls through
# to MQTT_USERNAME. Both stay None when nothing is configured.
MQTT_USERNAME = os.environ.get("MQTT_USER") or os.environ.get("MQTT_USERNAME")
MQTT_PASSWORD = os.environ.get("MQTT_PASSWORD")

# TLS settings. The boolean switches accept "1", "true", "yes" or "on"
# (case-insensitive, surrounding whitespace ignored); anything else is False.
MQTT_TLS_ENABLED = os.environ.get("MQTT_TLS_ENABLED", "false").strip().lower() in {
    "1",
    "true",
    "yes",
    "on",
}
MQTT_TLS_CA_CERT = os.environ.get("MQTT_TLS_CA_CERT")
MQTT_TLS_CERTFILE = os.environ.get("MQTT_TLS_CERTFILE")
MQTT_TLS_KEYFILE = os.environ.get("MQTT_TLS_KEYFILE")
MQTT_TLS_INSECURE = os.environ.get("MQTT_TLS_INSECURE", "false").strip().lower() in {
    "1",
    "true",
    "yes",
    "on",
}
|
||||
|
||||
|
||||
def _to_utc_z(dt: datetime.datetime) -> str:
|
||||
@@ -224,6 +240,19 @@ def main():
|
||||
client = mqtt.Client(callback_api_version=mqtt.CallbackAPIVersion.VERSION2)
|
||||
client.reconnect_delay_set(min_delay=1, max_delay=30)
|
||||
|
||||
if MQTT_USERNAME and MQTT_PASSWORD:
|
||||
client.username_pw_set(MQTT_USERNAME, MQTT_PASSWORD)
|
||||
|
||||
if MQTT_TLS_ENABLED:
|
||||
client.tls_set(
|
||||
ca_certs=MQTT_TLS_CA_CERT,
|
||||
certfile=MQTT_TLS_CERTFILE,
|
||||
keyfile=MQTT_TLS_KEYFILE,
|
||||
cert_reqs=ssl.CERT_REQUIRED,
|
||||
)
|
||||
if MQTT_TLS_INSECURE:
|
||||
client.tls_insecure_set(True)
|
||||
|
||||
POLL_INTERVAL = int(os.getenv("POLL_INTERVAL_SECONDS", "30"))
|
||||
# 0 = aus; z.B. 600 für alle 10 Min
|
||||
# initial value from DB or fallback to env
|
||||
@@ -238,16 +267,21 @@ def main():
|
||||
POWER_INTENT_HEARTBEAT_ENABLED = _env_bool("POWER_INTENT_HEARTBEAT_ENABLED", True)
|
||||
POWER_INTENT_EXPIRY_MULTIPLIER = int(os.getenv("POWER_INTENT_EXPIRY_MULTIPLIER", "3"))
|
||||
POWER_INTENT_MIN_EXPIRY_SECONDS = int(os.getenv("POWER_INTENT_MIN_EXPIRY_SECONDS", "90"))
|
||||
CRASH_RECOVERY_ENABLED = _env_bool("CRASH_RECOVERY_ENABLED", False)
|
||||
CRASH_RECOVERY_GRACE_SECONDS = int(os.getenv("CRASH_RECOVERY_GRACE_SECONDS", "180"))
|
||||
|
||||
logging.info(
|
||||
"Scheduler config: poll_interval=%ss refresh_seconds=%s power_intent_enabled=%s "
|
||||
"power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss",
|
||||
"power_intent_heartbeat=%s power_intent_expiry_multiplier=%s power_intent_min_expiry=%ss "
|
||||
"crash_recovery_enabled=%s crash_recovery_grace=%ss",
|
||||
POLL_INTERVAL,
|
||||
REFRESH_SECONDS,
|
||||
POWER_INTENT_PUBLISH_ENABLED,
|
||||
POWER_INTENT_HEARTBEAT_ENABLED,
|
||||
POWER_INTENT_EXPIRY_MULTIPLIER,
|
||||
POWER_INTENT_MIN_EXPIRY_SECONDS,
|
||||
CRASH_RECOVERY_ENABLED,
|
||||
CRASH_RECOVERY_GRACE_SECONDS,
|
||||
)
|
||||
# Konfigurierbares Zeitfenster in Tagen (Standard: 7)
|
||||
WINDOW_DAYS = int(os.getenv("EVENTS_WINDOW_DAYS", "7"))
|
||||
@@ -275,8 +309,15 @@ def main():
|
||||
|
||||
client.on_connect = on_connect
|
||||
|
||||
client.connect("mqtt", 1883)
|
||||
client.connect(MQTT_BROKER_HOST, MQTT_BROKER_PORT)
|
||||
client.loop_start()
|
||||
logging.info(
|
||||
"MQTT connection configured host=%s port=%s tls=%s auth=%s",
|
||||
MQTT_BROKER_HOST,
|
||||
MQTT_BROKER_PORT,
|
||||
MQTT_TLS_ENABLED,
|
||||
bool(MQTT_USERNAME and MQTT_PASSWORD),
|
||||
)
|
||||
|
||||
while True:
|
||||
now = datetime.datetime.now(datetime.timezone.utc)
|
||||
@@ -390,6 +431,51 @@ def main():
|
||||
power_intent_metrics["retained_republish_total"],
|
||||
)
|
||||
|
||||
if CRASH_RECOVERY_ENABLED:
    # Crash auto-recovery: fetch the current recovery candidates and publish
    # one recovery command per candidate on both the primary and the compat
    # topic. A failure for one candidate must not abort the others, and a
    # failure of the whole scan must not abort the poll cycle.
    try:
        candidates = get_crash_recovery_candidates(CRASH_RECOVERY_GRACE_SECONDS)
        if candidates:
            logging.info("event=crash_recovery_scan candidates=%s", len(candidates))
            for candidate in candidates:
                cuuid = candidate["uuid"]
                reason = candidate["reason"]
                try:
                    command_id, payload, topic, compat_topic = issue_crash_recovery_command(
                        client_uuid=cuuid,
                        reason=reason,
                    )
                    message = json.dumps(payload)

                    # Publish to both topics and record a per-topic error.
                    # wait_for_publish() raises ValueError/RuntimeError when
                    # the publish call itself was rejected (e.g. client not
                    # connected) and returns silently on timeout, so neither
                    # an exception nor `rc` alone tells us whether the
                    # broker acknowledged the message; is_published() does.
                    publish_errors = {}
                    for label, target in (("primary", topic), ("compat", compat_topic)):
                        try:
                            info = client.publish(target, message, qos=1, retain=False)
                            info.wait_for_publish(timeout=5.0)
                            if not info.is_published():
                                if info.rc != mqtt.MQTT_ERR_SUCCESS:
                                    publish_errors[label] = mqtt.error_string(info.rc)
                                else:
                                    publish_errors[label] = "publish not acknowledged within 5.0s"
                        except (ValueError, RuntimeError) as pub_exc:
                            publish_errors[label] = str(pub_exc)

                    # As before, the primary topic decides the command's
                    # published state; the compat topic is best-effort, but
                    # its failure is no longer silent.
                    # NOTE(review): a QoS 1 message rejected while
                    # disconnected may still be queued by the MQTT client and
                    # sent after reconnect — confirm this is acceptable for a
                    # command finalized as unpublished.
                    success = "primary" not in publish_errors
                    error = publish_errors.get("primary")
                    finalize_crash_recovery_command(command_id, published=success, error=error)

                    if "compat" in publish_errors:
                        logging.warning(
                            "event=crash_recovery_compat_publish_failed client_uuid=%s reason=%s command_id=%s error=%s",
                            cuuid, reason, command_id, publish_errors["compat"],
                        )
                    if success:
                        logging.info(
                            "event=crash_recovery_command_issued client_uuid=%s reason=%s command_id=%s",
                            cuuid, reason, command_id,
                        )
                    else:
                        logging.error(
                            "event=crash_recovery_publish_failed client_uuid=%s reason=%s command_id=%s error=%s",
                            cuuid, reason, command_id, error,
                        )
                except Exception as cmd_exc:
                    # Covers failures while issuing, serializing or
                    # finalizing the command for this candidate.
                    logging.error(
                        "event=crash_recovery_command_error client_uuid=%s reason=%s error=%s",
                        cuuid, reason, cmd_exc,
                    )
    except Exception as scan_exc:
        logging.error("event=crash_recovery_scan_error error=%s", scan_exc)
|
||||
|
||||
# Command expiry sweep, run once per poll cycle. Any error is logged and
# swallowed so that the scheduler loop keeps running.
try:
    swept_total = sweep_expired_commands()
    # Only log when something was actually expired to keep idle cycles quiet.
    if swept_total:
        logging.info("event=command_expiry_sweep expired=%s", swept_total)
except Exception as sweep_failure:
    logging.error("event=command_expiry_sweep_error error=%s", sweep_failure)
|
||||
|
||||
time.sleep(POLL_INTERVAL)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user