feat: remote commands, systemd units, process observability, broker auth split

- Command intake (reboot/shutdown) on infoscreen/{uuid}/commands with ack lifecycle
- MQTT_USER/MQTT_PASSWORD_BROKER split from identity vars; configure_mqtt_security() updated
- infoscreen-simclient.service: Type=notify, WatchdogSec=60, Restart=on-failure
- infoscreen-notify-failure@.service + script: retained MQTT alert when systemd gives up (Gap 3)
- _sd_notify() watchdog keepalive in simclient main loop (Gap 1)
- broker_connection block in health payload: reconnect_count, last_disconnect_at (Gap 2)
- COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE canary flag with safety guard
- SERVER_TEAM_ACTIONS.md: server-side integration action items
- Docs: README, CHANGELOG, src/README, copilot-instructions updated
- 43 tests passing
This commit is contained in:
RobbStarkAustria
2026-04-05 08:36:50 +02:00
parent 82f43f75ba
commit 0cd0d95612
28 changed files with 2487 additions and 36 deletions

View File

@@ -7,18 +7,39 @@ import json
import socket
import hashlib
import paho.mqtt.client as mqtt
import ssl
import os
import shutil
import re
import platform
import logging
import subprocess
from dotenv import load_dotenv
import requests
import base64
from datetime import datetime, timezone
from datetime import datetime, timezone, timedelta
import threading
from urllib.parse import urlsplit, urlunsplit, unquote
def _sd_notify(msg: str) -> None:
"""Send a sd_notify message to systemd via NOTIFY_SOCKET.
Uses raw socket so no extra package (systemd-python) is required.
Safe to call when not running under systemd (NOTIFY_SOCKET unset).
"""
sock_path = os.environ.get("NOTIFY_SOCKET")
if not sock_path:
return
try:
addr = sock_path.lstrip("@") # abstract namespace sockets start with '@'
with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as s:
s.connect(addr)
s.sendall(msg.encode())
except Exception:
pass # never crash the app over a watchdog notification
# ENV laden - support both container and native development
env_paths = [
"/workspace/simclient/.env", # Container path
@@ -96,6 +117,18 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG" if ENV == "development" else "INFO")
MQTT_BROKER = _env_host("MQTT_BROKER", "localhost" if ENV == "development" else "mqtt")
MQTT_PORT = _env_int("MQTT_PORT", 1883)
DEBUG_MODE = _env_bool("DEBUG_MODE", ENV == "development")
MQTT_USER = _env_str_clean("MQTT_USER", "")
MQTT_PASSWORD_BROKER = _env_str_clean("MQTT_PASSWORD_BROKER", "")
MQTT_USERNAME = _env_str_clean("MQTT_USERNAME", "")
MQTT_PASSWORD = _env_str_clean("MQTT_PASSWORD", "")
MQTT_TLS_CA_CERT = _env_str_clean("MQTT_TLS_CA_CERT", "")
MQTT_TLS_CERT = _env_str_clean("MQTT_TLS_CERT", "")
MQTT_TLS_KEY = _env_str_clean("MQTT_TLS_KEY", "")
MQTT_TLS_INSECURE = _env_bool("MQTT_TLS_INSECURE", False)
MQTT_TLS_ENABLED = _env_bool(
"MQTT_TLS_ENABLED",
bool(MQTT_TLS_CA_CERT or MQTT_TLS_CERT or MQTT_TLS_KEY),
)
MQTT_BROKER_FALLBACKS = []
_fallbacks_raw = os.getenv("MQTT_BROKER_FALLBACKS", "")
if _fallbacks_raw:
@@ -150,6 +183,48 @@ SCREENSHOT_META_FILE = os.path.join(SCREENSHOT_DIR, "meta.json")
POWER_CONTROL_MODE = os.getenv("POWER_CONTROL_MODE", "local").strip().lower()
POWER_INTENT_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_intent_state.json")
POWER_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_state.json")
COMMAND_STATE_DIR = os.path.join(os.path.dirname(__file__), "config")
PROCESSED_COMMANDS_FILE = os.path.join(COMMAND_STATE_DIR, "processed_commands.json")
LAST_COMMAND_STATE_FILE = os.path.join(COMMAND_STATE_DIR, "last_command_state.json")
COMMAND_HELPER_PATH = os.getenv("COMMAND_HELPER_PATH", "/usr/local/bin/infoscreen-cmd-helper.sh")
COMMAND_EXEC_TIMEOUT_SEC = _env_int("COMMAND_EXEC_TIMEOUT_SEC", 15)
COMMAND_DEDUPE_TTL_HOURS = _env_int("COMMAND_DEDUPE_TTL_HOURS", 24)
COMMAND_DEDUPE_MAX_ENTRIES = _env_int("COMMAND_DEDUPE_MAX_ENTRIES", 5000)
COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE = _env_bool("COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE", False)
NIL_COMMAND_ID = "00000000-0000-0000-0000-000000000000"
COMMAND_ACTIONS = ("reboot_host", "shutdown_host")
ACK_STATUSES = ("accepted", "execution_started", "completed", "failed")
COMMAND_ERROR_CODES = {
"invalid_schema",
"missing_field",
"stale_command",
"duplicate_command",
"permission_denied_local",
"execution_timeout",
"execution_failed",
"broker_unavailable",
"internal_error",
}
def command_requires_recovery_completion(action):
    """Return True when *action* only completes after host recovery (reboot)."""
    return action in ("reboot_host",)
def command_mock_reboot_immediate_complete_enabled(action):
    """Return True when the mock-reboot canary may skip the recovery path.

    Applies only to reboot_host, only when the env flag is set, and only when
    the configured helper really is the mock script — a safety guard against
    enabling the flag on a production host.
    """
    if action != "reboot_host":
        return False
    if not COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE:
        return False
    helper_name = os.path.basename((COMMAND_HELPER_PATH or "").strip())
    if helper_name != "mock-command-helper.sh":
        logging.warning(
            "Ignoring COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE because helper is not mock: %s",
            COMMAND_HELPER_PATH,
        )
        return False
    return True
discovered = False
@@ -361,6 +436,246 @@ def write_power_intent_state(data):
logging.error(f"Error writing power intent state: {e}")
def _atomic_write_json(path, data):
os.makedirs(os.path.dirname(path), exist_ok=True)
tmp_path = path + ".tmp"
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
os.replace(tmp_path, path)
def _read_json_or_default(path, default):
try:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
except Exception:
return default
def _extract_command_id(payload):
    """Best-effort extraction of a non-empty command_id for ack correlation.

    Falls back to the nil UUID when the payload is not a dict or carries no
    usable command_id, so even malformed commands get a correlated ack.
    """
    if not isinstance(payload, dict):
        return NIL_COMMAND_ID
    candidate = payload.get("command_id")
    if isinstance(candidate, str):
        candidate = candidate.strip()
        if candidate:
            return candidate
    return NIL_COMMAND_ID
def _as_uuid_str(value):
if not isinstance(value, str) or not value.strip():
raise ValueError("must be a non-empty string")
return str(uuid.UUID(value.strip()))
def _prune_processed_commands(commands):
    """Prune the dedupe store: drop malformed, expired, and overflow entries.

    Keeps only dict entries whose ``processed_at`` parses and is newer than
    COMMAND_DEDUPE_TTL_HOURS, then caps the survivors at
    COMMAND_DEDUPE_MAX_ENTRIES newest entries.  Always returns a dict.
    """
    if not isinstance(commands, dict):
        return {}
    ttl = max(1, COMMAND_DEDUPE_TTL_HOURS)
    cutoff = datetime.now(timezone.utc) - timedelta(hours=ttl)
    surviving = {}
    timeline = []
    for cid, entry in commands.items():
        if not isinstance(entry, dict):
            continue
        stamp = entry.get("processed_at")
        if not stamp:
            continue
        try:
            when = _parse_utc_iso(stamp)
        except Exception:
            continue  # unparseable timestamp: treat the entry as garbage
        if when < cutoff:
            continue
        surviving[cid] = entry
        timeline.append((when, cid))
    cap = max(1, COMMAND_DEDUPE_MAX_ENTRIES)
    if len(timeline) > cap:
        # Newest first; ties broken by command_id ordering.
        timeline.sort(reverse=True)
        newest = {cid for _, cid in timeline[:cap]}
        surviving = {cid: entry for cid, entry in surviving.items() if cid in newest}
    return surviving
def load_processed_commands():
    """Load, prune, and rewrite the processed-command dedupe store.

    Returns the pruned mapping of command_id -> outcome entry; the pruned
    view is persisted back immediately so stale entries do not accumulate.
    """
    stored = _read_json_or_default(PROCESSED_COMMANDS_FILE, {"commands": {}})
    if isinstance(stored, dict):
        entries = stored.get("commands")
    else:
        entries = {}
    pruned = _prune_processed_commands(entries)
    _atomic_write_json(PROCESSED_COMMANDS_FILE, {"commands": pruned})
    return pruned
def persist_processed_commands(commands):
    """Persist *commands* to the dedupe store after TTL/size pruning."""
    pruned = _prune_processed_commands(commands)
    _atomic_write_json(PROCESSED_COMMANDS_FILE, {"commands": pruned})
def load_last_command_state():
    """Return the persisted last-command record, or {} when absent/corrupt."""
    state = _read_json_or_default(LAST_COMMAND_STATE_FILE, {})
    return state if isinstance(state, dict) else {}
def write_last_command_state(data):
    """Atomically persist the last-command record used for reboot recovery."""
    _atomic_write_json(LAST_COMMAND_STATE_FILE, data)
def validate_command_payload(payload, expected_client_uuid):
    """Validate a raw command payload against command schema version 1.0.

    Returns a 4-tuple ``(is_valid, normalized, error_code, error_message)``:
    ``(True, normalized_dict, None, None)`` on success, otherwise
    ``(False, None, code, message)`` where *code* is "invalid_schema",
    "missing_field", or "stale_command" (all members of COMMAND_ERROR_CODES).

    The normalized dict carries canonical lowercase UUID strings and UTC
    timestamps re-serialized as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    if not isinstance(payload, dict):
        return False, None, "invalid_schema", "payload must be a JSON object"
    # Schema 1.0 is closed: every field below is required, no extras allowed.
    required = {
        "schema_version",
        "command_id",
        "client_uuid",
        "action",
        "issued_at",
        "expires_at",
        "requested_by",
        "reason",
    }
    payload_keys = set(payload.keys())
    missing = sorted(required - payload_keys)
    if missing:
        return False, None, "missing_field", f"missing required fields: {', '.join(missing)}"
    extras = sorted(payload_keys - required)
    if extras:
        return False, None, "invalid_schema", f"unexpected fields: {', '.join(extras)}"
    if payload.get("schema_version") != "1.0":
        return False, None, "invalid_schema", "schema_version must be 1.0"
    try:
        command_id = _as_uuid_str(payload.get("command_id"))
    except Exception:
        return False, None, "invalid_schema", "command_id must be a valid UUID"
    try:
        client_uuid = _as_uuid_str(payload.get("client_uuid"))
    except Exception:
        return False, None, "invalid_schema", "client_uuid must be a valid UUID"
    # Normalize our own UUID too so formatting/case differences do not cause
    # a spurious mismatch; fall back to plain string compare if non-UUID.
    try:
        expected_uuid = _as_uuid_str(expected_client_uuid)
    except Exception:
        expected_uuid = str(expected_client_uuid).strip()
    if client_uuid != expected_uuid:
        return False, None, "invalid_schema", "client_uuid does not match this client"
    action = payload.get("action")
    if action not in COMMAND_ACTIONS:
        return False, None, "invalid_schema", f"action must be one of {COMMAND_ACTIONS}"
    try:
        issued_at = _parse_utc_iso(payload.get("issued_at"))
        expires_at = _parse_utc_iso(payload.get("expires_at"))
    except Exception as e:
        return False, None, "invalid_schema", f"invalid timestamp: {e}"
    if expires_at <= issued_at:
        return False, None, "invalid_schema", "expires_at must be later than issued_at"
    if datetime.now(timezone.utc) > expires_at:
        return False, None, "stale_command", "command expired"
    requested_by = payload.get("requested_by")
    if requested_by is not None:
        # bool is a subclass of int, so JSON true would otherwise slip
        # through as requested_by == 1; reject it explicitly.
        if isinstance(requested_by, bool) or not isinstance(requested_by, int) or requested_by < 1:
            return False, None, "invalid_schema", "requested_by must be integer >= 1 or null"
    reason = payload.get("reason")
    if reason is not None:
        if not isinstance(reason, str):
            return False, None, "invalid_schema", "reason must be string or null"
        if len(reason) > 2000:
            return False, None, "invalid_schema", "reason exceeds max length 2000"
    normalized = {
        "schema_version": "1.0",
        "command_id": command_id,
        "client_uuid": client_uuid,
        "action": action,
        "issued_at": issued_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "expires_at": expires_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "requested_by": requested_by,
        "reason": reason,
    }
    return True, normalized, None, None
def publish_command_ack(
    client,
    client_uuid,
    command_id,
    status,
    error_code=None,
    error_message=None,
    expires_at=None,
):
    """Publish a command lifecycle ack, retrying with backoff until expiry.

    *status* must be one of ACK_STATUSES ("accepted", "execution_started",
    "completed", "failed").  Error details are only carried on "failed";
    for any other status they are cleared.  The ack is published to both
    the canonical ``.../commands/ack`` topic and the transitional
    ``.../command/ack`` alias.

    Returns True once both topics accepted the publish, False when
    *expires_at* passed before that happened.  Raises ValueError for an
    unknown *status*.

    NOTE(review): with expires_at=None the retry loop has no exit on
    persistent failure — confirm callers never invoke this from a thread
    that must not block indefinitely.
    """
    if status not in ACK_STATUSES:
        raise ValueError(f"invalid ack status: {status}")
    if status == "failed":
        # Guarantee diagnostics on failure even if the caller supplied none.
        if not isinstance(error_code, str) or not error_code.strip():
            error_code = "internal_error"
        if not isinstance(error_message, str) or not error_message.strip():
            error_message = "failed without diagnostic message"
    else:
        error_code = None
        error_message = None
    # Cap field sizes so a pathological stderr cannot bloat the ack payload.
    if isinstance(error_code, str):
        error_code = error_code[:128]
    if isinstance(error_message, str):
        error_message = error_message[:4000]
    ack_payload = {
        "command_id": command_id,
        "status": status,
        "error_code": error_code,
        "error_message": error_message,
    }
    encoded = json.dumps(ack_payload)
    # Canonical topic plus transitional alias during server-side migration.
    ack_topics = [
        f"infoscreen/{client_uuid}/commands/ack",
        f"infoscreen/{client_uuid}/command/ack",
    ]
    retry_schedule = [0.5, 1, 2, 4, 5]  # backoff seconds; last value repeats
    attempt = 0
    while True:
        all_ok = True
        # NOTE(review): a partial failure re-publishes to both topics, so the
        # topic that succeeded can receive duplicates — presumably the server
        # dedupes acks by command_id/status; confirm.
        for topic in ack_topics:
            result = client.publish(topic, encoded, qos=1, retain=False)
            if result.rc != mqtt.MQTT_ERR_SUCCESS:
                all_ok = False
                logging.warning(
                    "Command ack publish failed: topic=%s status=%s rc=%s",
                    topic,
                    status,
                    result.rc,
                )
        if all_ok:
            logging.info("Command ack published: command_id=%s status=%s", command_id, status)
            return True
        # Stop retrying once the command itself has expired.
        if expires_at:
            try:
                if datetime.now(timezone.utc) >= _parse_utc_iso(expires_at):
                    logging.warning("Command ack retry stopped at expiry: command_id=%s", command_id)
                    return False
            except Exception:
                pass  # unparseable expiry: keep retrying rather than give up
        delay = retry_schedule[min(attempt, len(retry_schedule) - 1)]
        attempt += 1
        time.sleep(delay)
def on_message(client, userdata, msg, properties=None):
global discovered
logging.info(f"Received: {msg.topic} {msg.payload.decode()}")
@@ -482,6 +797,71 @@ def get_model():
SOFTWARE_VERSION = "1.0.0" # Optional: Anpassen bei neuen Releases
def _detect_watchdog_enabled():
env_flag = os.getenv("WATCHDOG_ENABLED", "").strip().lower()
if env_flag in ("1", "true", "yes", "on"):
return True
if os.path.exists("/dev/watchdog"):
return True
return False
def _detect_boot_source():
try:
with open("/proc/cmdline", "r", encoding="utf-8") as f:
cmdline = f.read().strip()
for token in cmdline.split():
if token.startswith("root="):
return token.split("=", 1)[1]
except Exception:
pass
return "unknown"
def configure_mqtt_security(client):
    """Apply username/password and TLS settings from module-level env config.

    Credential precedence: when MQTT_USER is set, the broker-scoped pair
    MQTT_USER / MQTT_PASSWORD_BROKER is used; otherwise the legacy
    MQTT_USERNAME / MQTT_PASSWORD pair.  TLS is configured only when
    MQTT_TLS_ENABLED is true.

    Returns a status dict {"username": bool, "tls": bool, "tls_insecure": bool}
    describing what was actually configured on *client*.
    """
    # Prefer broker-scoped auth vars when present, fallback to legacy vars.
    auth_username = MQTT_USER or MQTT_USERNAME
    auth_password = MQTT_PASSWORD_BROKER if MQTT_USER else MQTT_PASSWORD
    configured = {
        "username": bool(auth_username),
        "tls": False,
        "tls_insecure": False,
    }
    if auth_username:
        # An empty password is passed as None so paho omits the password field.
        client.username_pw_set(auth_username, auth_password or None)
        configured["username"] = True
        logging.info("Configured MQTT username/password authentication")
    if not MQTT_TLS_ENABLED:
        return configured
    # Empty cert/key paths become None so paho falls back to its defaults
    # (system CA store when ca_certs is None).
    tls_kwargs = {
        "ca_certs": MQTT_TLS_CA_CERT or None,
        "certfile": MQTT_TLS_CERT or None,
        "keyfile": MQTT_TLS_KEY or None,
        "tls_version": ssl.PROTOCOL_TLS_CLIENT,
    }
    client.tls_set(**tls_kwargs)
    configured["tls"] = True
    if MQTT_TLS_INSECURE:
        # Deliberate opt-out of hostname verification (dev/test only).
        client.tls_insecure_set(True)
        configured["tls_insecure"] = True
        logging.warning("MQTT TLS hostname verification disabled via MQTT_TLS_INSECURE")
    else:
        client.tls_insecure_set(False)
    logging.info(
        "Configured MQTT TLS: ca=%s client_cert=%s client_key=%s",
        bool(MQTT_TLS_CA_CERT),
        bool(MQTT_TLS_CERT),
        bool(MQTT_TLS_KEY),
    )
    return configured
def send_discovery(client, client_id, hardware_token, ip_addr):
macs = get_mac_addresses()
discovery_msg = {
@@ -494,6 +874,12 @@ def send_discovery(client, client_id, hardware_token, ip_addr):
"software_version": SOFTWARE_VERSION,
"macs": macs,
"model": get_model(),
"capabilities": {
"recovery_class": "software_only",
"watchdog_enabled": _detect_watchdog_enabled(),
"boot_source": _detect_boot_source(),
"command_schema_version": "1.0",
},
}
client.publish("infoscreen/discovery", json.dumps(discovery_msg))
logging.info(f"Discovery message sent: {discovery_msg}")
@@ -766,7 +1152,7 @@ def delete_client_settings():
logging.error(f"Error deleting client settings: {e}")
def publish_health_message(client, client_id):
def publish_health_message(client, client_id, connection_state=None):
"""Publish health status to server via MQTT"""
try:
health = read_health_state()
@@ -784,6 +1170,17 @@ def publish_health_message(client, client_id):
"status": health.get("process_status")
}
}
if connection_state is not None:
last_disc = connection_state.get("last_disconnect")
payload["broker_connection"] = {
"broker_reachable": bool(connection_state.get("connected")),
"reconnect_count": connection_state.get("reconnect_count", 0),
"last_disconnect_at": (
datetime.fromtimestamp(last_disc, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
if last_disc else None
),
}
topic = f"infoscreen/{client_id}/health"
res = client.publish(topic, json.dumps(payload), qos=1)
@@ -1007,6 +1404,10 @@ def main():
power_intent_topic = None
last_power_intent_id = None
last_power_issued_at = None
command_topic = f"infoscreen/{client_id}/commands"
command_topic_alias = f"infoscreen/{client_id}/command"
processed_commands = load_processed_commands()
pending_recovery_command = load_last_command_state()
# paho-mqtt v2: opt into latest callback API to avoid deprecation warnings.
client_kwargs = {"protocol": mqtt.MQTTv311}
@@ -1018,12 +1419,18 @@ def main():
pass
client = mqtt.Client(**client_kwargs)
client.on_message = on_message
configure_mqtt_security(client)
# Enable automatic reconnection
client.reconnect_delay_set(min_delay=1, max_delay=120)
# Connection state tracking
connection_state = {"connected": False, "last_disconnect": None}
connection_state = {
"connected": False,
"last_disconnect": None,
"reconnect_count": 0,
"connect_count": 0,
}
# Optional: Enable MQTT debug logging in DEBUG_MODE
if DEBUG_MODE:
@@ -1090,6 +1497,9 @@ def main():
if rc == 0:
connection_state["connected"] = True
connection_state["last_disconnect"] = None
connection_state["connect_count"] = connection_state.get("connect_count", 0) + 1
if connection_state["connect_count"] > 1:
connection_state["reconnect_count"] = connection_state.get("reconnect_count", 0) + 1
# Check if this is a reconnection
# paho-mqtt v2 provides ConnectFlags with attribute 'session_present'
@@ -1121,6 +1531,11 @@ def main():
group_id_topic = f"infoscreen/{client_id}/group_id"
client.subscribe(group_id_topic)
logging.info(f"Subscribed to: {group_id_topic}")
# Command topics (canonical + transitional)
client.subscribe(command_topic, qos=1)
client.subscribe(command_topic_alias, qos=1)
logging.info(f"Subscribed to command topics: {command_topic}, {command_topic_alias}")
# Wenn beim Start eine group_id vorhanden ist, sofort Event-Topic abonnieren
# Reset event_topic so subscribe_event_topic always re-registers with the broker
@@ -1233,6 +1648,189 @@ def main():
client.message_callback_add(group_id_topic, on_group_id_message)
logging.info(f"Current group_id at start: {current_group_id if current_group_id else 'none'}")
def mark_command_processed(command_id, status, error_code=None):
    """Record a terminal outcome for *command_id* in the dedupe store and persist it."""
    outcome = {
        "status": status,
        "error_code": error_code,
        "processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    processed_commands[command_id] = outcome
    persist_processed_commands(processed_commands)
def execute_command_action(action):
    """Run the privileged command helper for *action* via sudo.

    Returns ``(ok, error_code, error_message)``; the error fields are None
    on success.  Error codes are members of COMMAND_ERROR_CODES so they can
    be forwarded verbatim in a "failed" ack.
    """
    try:
        proc = subprocess.run(
            ["sudo", COMMAND_HELPER_PATH, action],  # list form: no shell involved
            timeout=max(1, COMMAND_EXEC_TIMEOUT_SEC),
            check=False,
            capture_output=True,
            text=True,
        )
    except subprocess.TimeoutExpired:
        return False, "execution_timeout", f"command timed out after {COMMAND_EXEC_TIMEOUT_SEC}s"
    except PermissionError:
        return False, "permission_denied_local", "permission denied invoking command helper"
    except Exception as e:
        return False, "internal_error", f"internal execution error: {e}"
    if proc.returncode != 0:
        stderr = (proc.stderr or "").strip()
        # 126/127 are the shell conventions for not-executable / not-found;
        # treat both as a local permission/installation problem.
        if proc.returncode in (126, 127):
            return False, "permission_denied_local", stderr or "command helper unavailable"
        return False, "execution_failed", stderr or f"helper exited with code {proc.returncode}"
    return True, None, None
def on_command_message(client, userdata, msg, properties=None):
    """MQTT callback for infoscreen/{uuid}/commands (and the /command alias).

    Drives the full ack lifecycle: validation failures are acked "failed"
    immediately; valid commands are acked "accepted" then
    "execution_started", executed via the privileged helper, and finish
    with "completed" or "failed".  reboot_host normally completes only
    after recovery (next process start), unless the mock
    immediate-complete canary is active.
    """
    payload_text = msg.payload.decode().strip()
    received_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    expires_at = None
    # An empty payload cannot carry a command_id; ack against the nil UUID.
    if not payload_text:
        publish_command_ack(
            client,
            client_id,
            NIL_COMMAND_ID,
            "failed",
            error_code="invalid_schema",
            error_message="empty command payload",
        )
        return
    try:
        payload = json.loads(payload_text)
    except json.JSONDecodeError as e:
        publish_command_ack(
            client,
            client_id,
            NIL_COMMAND_ID,
            "failed",
            error_code="invalid_schema",
            error_message=f"invalid JSON: {e}",
        )
        return
    # Best-effort id/expiry so even schema-invalid commands get a correlated ack.
    command_id_hint = _extract_command_id(payload)
    if isinstance(payload, dict):
        expires_at = payload.get("expires_at")
    is_valid, normalized, error_code, error_message = validate_command_payload(payload, client_id)
    if not is_valid:
        publish_command_ack(
            client,
            client_id,
            command_id_hint,
            "failed",
            error_code=error_code,
            error_message=error_message,
            expires_at=expires_at,
        )
        return
    command_id = normalized["command_id"]
    expires_at = normalized["expires_at"]
    action = normalized["action"]
    # qos=1 delivery is at-least-once: reject redelivered command_ids.
    if command_id in processed_commands:
        publish_command_ack(
            client,
            client_id,
            command_id,
            "failed",
            error_code="duplicate_command",
            error_message="command_id already processed",
            expires_at=expires_at,
        )
        return
    publish_command_ack(
        client,
        client_id,
        command_id,
        "accepted",
        expires_at=expires_at,
    )
    publish_command_ack(
        client,
        client_id,
        command_id,
        "execution_started",
        expires_at=expires_at,
    )
    # Persist progress BEFORE executing, so a reboot can be finished
    # ("recovered" -> completed ack) by the heartbeat loop after restart.
    write_last_command_state({
        "command_id": command_id,
        "action": action,
        "ack_status": "execution_started",
        "received_at": received_at,
        "execution_started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "expires_at": expires_at,
        "source_topic": msg.topic,
    })
    ok, exec_error_code, exec_error_message = execute_command_action(action)
    if ok:
        if command_requires_recovery_completion(action):
            if command_mock_reboot_immediate_complete_enabled(action):
                # Canary/mock path: fall through to immediate completion below.
                logging.info(
                    "Mock reboot immediate completion enabled: command_id=%s action=%s",
                    command_id,
                    action,
                )
            else:
                # Real reboot: "completed" is published post-recovery, not here.
                logging.info(
                    "Command entered recovery completion path: command_id=%s action=%s",
                    command_id,
                    action,
                )
                return
        logging.info(
            "Command continuing to immediate completion path: command_id=%s action=%s",
            command_id,
            action,
        )
        publish_command_ack(
            client,
            client_id,
            command_id,
            "completed",
            expires_at=expires_at,
        )
        mark_command_processed(command_id, "completed")
        write_last_command_state({
            "command_id": command_id,
            "action": action,
            "ack_status": "completed",
            "completed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "expires_at": expires_at,
        })
        return
    # Execution failed: report and record so redelivery counts as duplicate.
    publish_command_ack(
        client,
        client_id,
        command_id,
        "failed",
        error_code=exec_error_code,
        error_message=exec_error_message,
        expires_at=expires_at,
    )
    mark_command_processed(command_id, "failed", error_code=exec_error_code)
    write_last_command_state({
        "command_id": command_id,
        "action": action,
        "ack_status": "failed",
        "error_code": exec_error_code,
        "error_message": exec_error_message,
        "failed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "expires_at": expires_at,
    })
client.message_callback_add(command_topic, on_command_message)
client.message_callback_add(command_topic_alias, on_command_message)
def on_power_intent_message(client, userdata, msg, properties=None):
nonlocal last_power_intent_id, last_power_issued_at
@@ -1396,7 +1994,8 @@ def main():
# Heartbeat-Loop with connection state monitoring
last_heartbeat = 0
logging.info("Entering heartbeat loop (network loop already running in background thread)")
_sd_notify("READY=1") # tell systemd the process is fully initialised
while True:
try:
current_time = time.time()
@@ -1416,7 +2015,31 @@ def main():
if result.rc == mqtt.MQTT_ERR_SUCCESS:
logging.info("Heartbeat sent.")
# Also send health and screenshot heartbeats
publish_health_message(client, client_id)
publish_health_message(client, client_id, connection_state)
if (
isinstance(pending_recovery_command, dict)
and pending_recovery_command.get("ack_status") == "execution_started"
and pending_recovery_command.get("action") == "reboot_host"
and pending_recovery_command.get("command_id")
):
recovered_command_id = pending_recovery_command.get("command_id")
recovered_expires = pending_recovery_command.get("expires_at")
publish_command_ack(
client,
client_id,
recovered_command_id,
"completed",
expires_at=recovered_expires,
)
mark_command_processed(recovered_command_id, "completed")
write_last_command_state({
"command_id": recovered_command_id,
"action": pending_recovery_command.get("action"),
"ack_status": "recovered",
"recovered_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
"expires_at": recovered_expires,
})
pending_recovery_command = None
elif result.rc == mqtt.MQTT_ERR_NO_CONN:
logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...")
time.sleep(2)
@@ -1424,7 +2047,7 @@ def main():
retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
if retry.rc == mqtt.MQTT_ERR_SUCCESS:
logging.info("Heartbeat sent after retry.")
publish_health_message(client, client_id)
publish_health_message(client, client_id, connection_state)
else:
logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}")
else:
@@ -1435,6 +2058,7 @@ def main():
logging.debug("Skipping heartbeat - MQTT not connected (is_connected=False)")
last_heartbeat = current_time
_sd_notify("WATCHDOG=1") # kick systemd watchdog each loop iteration
time.sleep(5)
except KeyboardInterrupt:
logging.info("Shutting down gracefully...")