feat: remote commands, systemd units, process observability, broker auth split
- Command intake (reboot/shutdown) on infoscreen/{uuid}/commands with ack lifecycle
- MQTT_USER/MQTT_PASSWORD_BROKER split from identity vars; configure_mqtt_security() updated
- infoscreen-simclient.service: Type=notify, WatchdogSec=60, Restart=on-failure
- infoscreen-notify-failure@.service + script: retained MQTT alert when systemd gives up (Gap 3)
- _sd_notify() watchdog keepalive in simclient main loop (Gap 1)
- broker_connection block in health payload: reconnect_count, last_disconnect_at (Gap 2)
- COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE canary flag with safety guard
- SERVER_TEAM_ACTIONS.md: server-side integration action items
- Docs: README, CHANGELOG, src/README, copilot-instructions updated
- 43 tests passing
This commit is contained in:
636
src/simclient.py
636
src/simclient.py
@@ -7,18 +7,39 @@ import json
|
||||
import socket
|
||||
import hashlib
|
||||
import paho.mqtt.client as mqtt
|
||||
import ssl
|
||||
import os
|
||||
import shutil
|
||||
import re
|
||||
import platform
|
||||
import logging
|
||||
import subprocess
|
||||
from dotenv import load_dotenv
|
||||
import requests
|
||||
import base64
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timezone, timedelta
|
||||
import threading
|
||||
from urllib.parse import urlsplit, urlunsplit, unquote
|
||||
|
||||
|
||||
def _sd_notify(msg: str) -> None:
|
||||
"""Send a sd_notify message to systemd via NOTIFY_SOCKET.
|
||||
|
||||
Uses raw socket so no extra package (systemd-python) is required.
|
||||
Safe to call when not running under systemd (NOTIFY_SOCKET unset).
|
||||
"""
|
||||
sock_path = os.environ.get("NOTIFY_SOCKET")
|
||||
if not sock_path:
|
||||
return
|
||||
try:
|
||||
addr = sock_path.lstrip("@") # abstract namespace sockets start with '@'
|
||||
with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as s:
|
||||
s.connect(addr)
|
||||
s.sendall(msg.encode())
|
||||
except Exception:
|
||||
pass # never crash the app over a watchdog notification
|
||||
|
||||
|
||||
# ENV laden - support both container and native development
|
||||
env_paths = [
|
||||
"/workspace/simclient/.env", # Container path
|
||||
@@ -96,6 +117,18 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG" if ENV == "development" else "INFO")
|
||||
MQTT_BROKER = _env_host("MQTT_BROKER", "localhost" if ENV == "development" else "mqtt")
|
||||
MQTT_PORT = _env_int("MQTT_PORT", 1883)
|
||||
DEBUG_MODE = _env_bool("DEBUG_MODE", ENV == "development")
|
||||
MQTT_USER = _env_str_clean("MQTT_USER", "")
|
||||
MQTT_PASSWORD_BROKER = _env_str_clean("MQTT_PASSWORD_BROKER", "")
|
||||
MQTT_USERNAME = _env_str_clean("MQTT_USERNAME", "")
|
||||
MQTT_PASSWORD = _env_str_clean("MQTT_PASSWORD", "")
|
||||
MQTT_TLS_CA_CERT = _env_str_clean("MQTT_TLS_CA_CERT", "")
|
||||
MQTT_TLS_CERT = _env_str_clean("MQTT_TLS_CERT", "")
|
||||
MQTT_TLS_KEY = _env_str_clean("MQTT_TLS_KEY", "")
|
||||
MQTT_TLS_INSECURE = _env_bool("MQTT_TLS_INSECURE", False)
|
||||
MQTT_TLS_ENABLED = _env_bool(
|
||||
"MQTT_TLS_ENABLED",
|
||||
bool(MQTT_TLS_CA_CERT or MQTT_TLS_CERT or MQTT_TLS_KEY),
|
||||
)
|
||||
MQTT_BROKER_FALLBACKS = []
|
||||
_fallbacks_raw = os.getenv("MQTT_BROKER_FALLBACKS", "")
|
||||
if _fallbacks_raw:
|
||||
@@ -150,6 +183,48 @@ SCREENSHOT_META_FILE = os.path.join(SCREENSHOT_DIR, "meta.json")
|
||||
# --- Power control & remote-command configuration ---------------------------
# "local" vs. delegated power control; lower-cased for comparisons.
POWER_CONTROL_MODE = os.getenv("POWER_CONTROL_MODE", "local").strip().lower()
POWER_INTENT_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_intent_state.json")
POWER_STATE_FILE = os.path.join(os.path.dirname(__file__), "power_state.json")
COMMAND_STATE_DIR = os.path.join(os.path.dirname(__file__), "config")
# Dedupe map of already-processed command ids (survives restarts).
PROCESSED_COMMANDS_FILE = os.path.join(COMMAND_STATE_DIR, "processed_commands.json")
# Lifecycle record of the most recent command (used for reboot recovery).
LAST_COMMAND_STATE_FILE = os.path.join(COMMAND_STATE_DIR, "last_command_state.json")
# Privileged helper invoked (via sudo) to actually reboot/shutdown the host.
COMMAND_HELPER_PATH = os.getenv("COMMAND_HELPER_PATH", "/usr/local/bin/infoscreen-cmd-helper.sh")
COMMAND_EXEC_TIMEOUT_SEC = _env_int("COMMAND_EXEC_TIMEOUT_SEC", 15)
# Dedupe retention: time window and hard entry cap.
COMMAND_DEDUPE_TTL_HOURS = _env_int("COMMAND_DEDUPE_TTL_HOURS", 24)
COMMAND_DEDUPE_MAX_ENTRIES = _env_int("COMMAND_DEDUPE_MAX_ENTRIES", 5000)
# Canary flag: lets a *mock* helper complete "reboot_host" immediately
# (guarded in command_mock_reboot_immediate_complete_enabled()).
COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE = _env_bool("COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE", False)

# Fallback command id for acks when the payload carries no usable UUID.
NIL_COMMAND_ID = "00000000-0000-0000-0000-000000000000"
# Actions accepted on the command topic.
COMMAND_ACTIONS = ("reboot_host", "shutdown_host")
# Valid ack lifecycle statuses, in order of occurrence.
ACK_STATUSES = ("accepted", "execution_started", "completed", "failed")
# Error codes reported in failure acks.
COMMAND_ERROR_CODES = {
    "invalid_schema",
    "missing_field",
    "stale_command",
    "duplicate_command",
    "permission_denied_local",
    "execution_timeout",
    "execution_failed",
    "broker_unavailable",
    "internal_error",
}
|
||||
|
||||
|
||||
def command_requires_recovery_completion(action):
    """Return True when *action* only completes after the host comes back.

    A reboot can only be acknowledged as completed once the client has
    restarted and reconnected; other actions complete immediately.
    """
    return action in ("reboot_host",)
|
||||
|
||||
|
||||
def command_mock_reboot_immediate_complete_enabled(action):
    """Return True when the mock-reboot canary may short-circuit *action*.

    Only applies to reboot_host and only when the configured command
    helper is the mock script; otherwise the flag is ignored (with a
    warning) so a real host is never marked rebooted without rebooting.
    """
    if action != "reboot_host":
        return False
    if not COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE:
        return False

    helper_name = os.path.basename((COMMAND_HELPER_PATH or "").strip())
    if helper_name != "mock-command-helper.sh":
        logging.warning(
            "Ignoring COMMAND_MOCK_REBOOT_IMMEDIATE_COMPLETE because helper is not mock: %s",
            COMMAND_HELPER_PATH,
        )
        return False
    return True
|
||||
|
||||
|
||||
discovered = False
|
||||
@@ -361,6 +436,246 @@ def write_power_intent_state(data):
|
||||
logging.error(f"Error writing power intent state: {e}")
|
||||
|
||||
|
||||
def _atomic_write_json(path, data):
|
||||
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||
tmp_path = path + ".tmp"
|
||||
with open(tmp_path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
os.replace(tmp_path, path)
|
||||
|
||||
|
||||
def _read_json_or_default(path, default):
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except Exception:
|
||||
return default
|
||||
|
||||
|
||||
def _extract_command_id(payload):
|
||||
if isinstance(payload, dict):
|
||||
value = payload.get("command_id")
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip()
|
||||
return NIL_COMMAND_ID
|
||||
|
||||
|
||||
def _as_uuid_str(value):
|
||||
if not isinstance(value, str) or not value.strip():
|
||||
raise ValueError("must be a non-empty string")
|
||||
return str(uuid.UUID(value.strip()))
|
||||
|
||||
|
||||
def _prune_processed_commands(commands):
|
||||
if not isinstance(commands, dict):
|
||||
return {}
|
||||
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(hours=max(1, COMMAND_DEDUPE_TTL_HOURS))
|
||||
kept = {}
|
||||
sortable = []
|
||||
|
||||
for command_id, entry in commands.items():
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
processed_at = entry.get("processed_at")
|
||||
if not processed_at:
|
||||
continue
|
||||
try:
|
||||
processed_dt = _parse_utc_iso(processed_at)
|
||||
except Exception:
|
||||
continue
|
||||
if processed_dt < cutoff:
|
||||
continue
|
||||
kept[command_id] = entry
|
||||
sortable.append((processed_dt, command_id))
|
||||
|
||||
max_entries = max(1, COMMAND_DEDUPE_MAX_ENTRIES)
|
||||
if len(sortable) > max_entries:
|
||||
sortable.sort(reverse=True)
|
||||
keep_ids = {cid for _, cid in sortable[:max_entries]}
|
||||
kept = {cid: entry for cid, entry in kept.items() if cid in keep_ids}
|
||||
|
||||
return kept
|
||||
|
||||
|
||||
def load_processed_commands():
    """Load, prune, and persist the command dedupe map from disk.

    The pruned map is written straight back so on-disk state never
    accumulates stale entries across restarts.
    """
    raw = _read_json_or_default(PROCESSED_COMMANDS_FILE, {"commands": {}})
    stored = raw.get("commands") if isinstance(raw, dict) else {}
    pruned = _prune_processed_commands(stored)
    _atomic_write_json(PROCESSED_COMMANDS_FILE, {"commands": pruned})
    return pruned
|
||||
|
||||
|
||||
def persist_processed_commands(commands):
    """Write the dedupe map to disk after pruning stale/excess entries."""
    _atomic_write_json(
        PROCESSED_COMMANDS_FILE,
        {"commands": _prune_processed_commands(commands)},
    )
|
||||
|
||||
|
||||
def load_last_command_state():
    """Read the persisted last-command record; {} when absent or invalid."""
    state = _read_json_or_default(LAST_COMMAND_STATE_FILE, {})
    return state if isinstance(state, dict) else {}
|
||||
|
||||
|
||||
def write_last_command_state(data):
    """Persist the most recent command's lifecycle record atomically."""
    _atomic_write_json(LAST_COMMAND_STATE_FILE, data)
|
||||
|
||||
|
||||
def validate_command_payload(payload, expected_client_uuid):
    """Validate and normalize a remote-command payload (schema 1.0).

    Returns a 4-tuple ``(is_valid, normalized, error_code, error_message)``.
    On success *normalized* contains canonical field values (UUIDs
    lower-cased, timestamps re-serialized as UTC "Z" strings) and the
    error fields are None; on failure *normalized* is None and
    *error_code* is one of COMMAND_ERROR_CODES.
    """
    if not isinstance(payload, dict):
        return False, None, "invalid_schema", "payload must be a JSON object"

    # Schema 1.0 is closed: every field is required, nothing extra allowed.
    required = {
        "schema_version",
        "command_id",
        "client_uuid",
        "action",
        "issued_at",
        "expires_at",
        "requested_by",
        "reason",
    }
    payload_keys = set(payload.keys())
    missing = sorted(required - payload_keys)
    if missing:
        return False, None, "missing_field", f"missing required fields: {', '.join(missing)}"
    extras = sorted(payload_keys - required)
    if extras:
        return False, None, "invalid_schema", f"unexpected fields: {', '.join(extras)}"

    if payload.get("schema_version") != "1.0":
        return False, None, "invalid_schema", "schema_version must be 1.0"

    try:
        command_id = _as_uuid_str(payload.get("command_id"))
    except Exception:
        return False, None, "invalid_schema", "command_id must be a valid UUID"

    try:
        client_uuid = _as_uuid_str(payload.get("client_uuid"))
    except Exception:
        return False, None, "invalid_schema", "client_uuid must be a valid UUID"

    # Canonicalize our own UUID too so the comparison is canonical-form vs
    # canonical-form; fall back to a plain strip when it isn't a UUID.
    try:
        expected_uuid = _as_uuid_str(expected_client_uuid)
    except Exception:
        expected_uuid = str(expected_client_uuid).strip()

    if client_uuid != expected_uuid:
        return False, None, "invalid_schema", "client_uuid does not match this client"

    action = payload.get("action")
    if action not in COMMAND_ACTIONS:
        return False, None, "invalid_schema", f"action must be one of {COMMAND_ACTIONS}"

    try:
        issued_at = _parse_utc_iso(payload.get("issued_at"))
        expires_at = _parse_utc_iso(payload.get("expires_at"))
    except Exception as e:
        return False, None, "invalid_schema", f"invalid timestamp: {e}"

    if expires_at <= issued_at:
        return False, None, "invalid_schema", "expires_at must be later than issued_at"
    if datetime.now(timezone.utc) > expires_at:
        return False, None, "stale_command", "command expired"

    requested_by = payload.get("requested_by")
    if requested_by is not None:
        # bool is a subclass of int, so exclude it explicitly; otherwise
        # JSON `true` would slip through as requested_by == 1.
        if isinstance(requested_by, bool) or not isinstance(requested_by, int) or requested_by < 1:
            return False, None, "invalid_schema", "requested_by must be integer >= 1 or null"

    reason = payload.get("reason")
    if reason is not None:
        if not isinstance(reason, str):
            return False, None, "invalid_schema", "reason must be string or null"
        if len(reason) > 2000:
            return False, None, "invalid_schema", "reason exceeds max length 2000"

    normalized = {
        "schema_version": "1.0",
        "command_id": command_id,
        "client_uuid": client_uuid,
        "action": action,
        "issued_at": issued_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "expires_at": expires_at.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "requested_by": requested_by,
        "reason": reason,
    }
    return True, normalized, None, None
|
||||
|
||||
|
||||
def publish_command_ack(
    client,
    client_uuid,
    command_id,
    status,
    error_code=None,
    error_message=None,
    expires_at=None,
):
    """Publish a command lifecycle ack, retrying with backoff on failure.

    Publishes to both the canonical (`.../commands/ack`) and transitional
    (`.../command/ack`) topics. For status "failed" an error_code /
    error_message pair is guaranteed (defaults substituted); for all other
    statuses both are forced to None. Retries stop at the command's expiry
    when *expires_at* (ISO UTC string) is given; otherwise after a bounded
    number of attempts so a broker outage can never block the calling
    (MQTT callback) thread forever.

    Returns True when all topics were published, False when retries gave up.
    Raises ValueError for an unknown *status*.
    """
    if status not in ACK_STATUSES:
        raise ValueError(f"invalid ack status: {status}")

    if status == "failed":
        # Guarantee diagnostics on failure acks.
        if not isinstance(error_code, str) or not error_code.strip():
            error_code = "internal_error"
        if not isinstance(error_message, str) or not error_message.strip():
            error_message = "failed without diagnostic message"
    else:
        error_code = None
        error_message = None

    # Keep payload sizes bounded.
    if isinstance(error_code, str):
        error_code = error_code[:128]
    if isinstance(error_message, str):
        error_message = error_message[:4000]

    ack_payload = {
        "command_id": command_id,
        "status": status,
        "error_code": error_code,
        "error_message": error_message,
    }
    encoded = json.dumps(ack_payload)

    ack_topics = [
        f"infoscreen/{client_uuid}/commands/ack",
        f"infoscreen/{client_uuid}/command/ack",
    ]
    retry_schedule = [0.5, 1, 2, 4, 5]
    max_attempts_without_expiry = 60  # ~5 min at the capped 5s delay
    attempt = 0

    while True:
        all_ok = True
        for topic in ack_topics:
            result = client.publish(topic, encoded, qos=1, retain=False)
            if result.rc != mqtt.MQTT_ERR_SUCCESS:
                all_ok = False
                logging.warning(
                    "Command ack publish failed: topic=%s status=%s rc=%s",
                    topic,
                    status,
                    result.rc,
                )

        if all_ok:
            logging.info("Command ack published: command_id=%s status=%s", command_id, status)
            return True

        if expires_at:
            try:
                if datetime.now(timezone.utc) >= _parse_utc_iso(expires_at):
                    logging.warning("Command ack retry stopped at expiry: command_id=%s", command_id)
                    return False
            except Exception:
                pass  # unparseable expiry: keep retrying on the attempt budget below
        if not expires_at and attempt >= max_attempts_without_expiry:
            # No expiry to bound the loop: give up instead of spinning
            # the callback thread indefinitely while the broker is down.
            logging.warning("Command ack retry budget exhausted: command_id=%s", command_id)
            return False

        delay = retry_schedule[min(attempt, len(retry_schedule) - 1)]
        attempt += 1
        time.sleep(delay)
|
||||
|
||||
|
||||
def on_message(client, userdata, msg, properties=None):
|
||||
global discovered
|
||||
logging.info(f"Received: {msg.topic} {msg.payload.decode()}")
|
||||
@@ -482,6 +797,71 @@ def get_model():
|
||||
SOFTWARE_VERSION = "1.0.0" # Optional: Anpassen bei neuen Releases
|
||||
|
||||
|
||||
def _detect_watchdog_enabled():
|
||||
env_flag = os.getenv("WATCHDOG_ENABLED", "").strip().lower()
|
||||
if env_flag in ("1", "true", "yes", "on"):
|
||||
return True
|
||||
if os.path.exists("/dev/watchdog"):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _detect_boot_source():
|
||||
try:
|
||||
with open("/proc/cmdline", "r", encoding="utf-8") as f:
|
||||
cmdline = f.read().strip()
|
||||
for token in cmdline.split():
|
||||
if token.startswith("root="):
|
||||
return token.split("=", 1)[1]
|
||||
except Exception:
|
||||
pass
|
||||
return "unknown"
|
||||
|
||||
|
||||
def configure_mqtt_security(client):
    """Apply username/password and TLS settings to a paho MQTT client.

    Broker-scoped credentials (MQTT_USER / MQTT_PASSWORD_BROKER) take
    precedence; the legacy identity variables (MQTT_USERNAME /
    MQTT_PASSWORD) are the fallback. Returns a dict describing what was
    configured: {"username": bool, "tls": bool, "tls_insecure": bool}.
    """
    # Prefer broker-scoped auth vars when present, fallback to legacy vars.
    username = MQTT_USER or MQTT_USERNAME
    password = MQTT_PASSWORD_BROKER if MQTT_USER else MQTT_PASSWORD

    configured = {
        "username": bool(username),
        "tls": False,
        "tls_insecure": False,
    }

    if username:
        client.username_pw_set(username, password or None)
        configured["username"] = True
        logging.info("Configured MQTT username/password authentication")

    if not MQTT_TLS_ENABLED:
        return configured

    client.tls_set(
        ca_certs=MQTT_TLS_CA_CERT or None,
        certfile=MQTT_TLS_CERT or None,
        keyfile=MQTT_TLS_KEY or None,
        tls_version=ssl.PROTOCOL_TLS_CLIENT,
    )
    configured["tls"] = True

    if MQTT_TLS_INSECURE:
        # Explicit opt-out of hostname verification — loudly logged.
        client.tls_insecure_set(True)
        configured["tls_insecure"] = True
        logging.warning("MQTT TLS hostname verification disabled via MQTT_TLS_INSECURE")
    else:
        client.tls_insecure_set(False)

    logging.info(
        "Configured MQTT TLS: ca=%s client_cert=%s client_key=%s",
        bool(MQTT_TLS_CA_CERT),
        bool(MQTT_TLS_CERT),
        bool(MQTT_TLS_KEY),
    )
    return configured
|
||||
|
||||
|
||||
def send_discovery(client, client_id, hardware_token, ip_addr):
|
||||
macs = get_mac_addresses()
|
||||
discovery_msg = {
|
||||
@@ -494,6 +874,12 @@ def send_discovery(client, client_id, hardware_token, ip_addr):
|
||||
"software_version": SOFTWARE_VERSION,
|
||||
"macs": macs,
|
||||
"model": get_model(),
|
||||
"capabilities": {
|
||||
"recovery_class": "software_only",
|
||||
"watchdog_enabled": _detect_watchdog_enabled(),
|
||||
"boot_source": _detect_boot_source(),
|
||||
"command_schema_version": "1.0",
|
||||
},
|
||||
}
|
||||
client.publish("infoscreen/discovery", json.dumps(discovery_msg))
|
||||
logging.info(f"Discovery message sent: {discovery_msg}")
|
||||
@@ -766,7 +1152,7 @@ def delete_client_settings():
|
||||
logging.error(f"Error deleting client settings: {e}")
|
||||
|
||||
|
||||
def publish_health_message(client, client_id):
|
||||
def publish_health_message(client, client_id, connection_state=None):
|
||||
"""Publish health status to server via MQTT"""
|
||||
try:
|
||||
health = read_health_state()
|
||||
@@ -784,6 +1170,17 @@ def publish_health_message(client, client_id):
|
||||
"status": health.get("process_status")
|
||||
}
|
||||
}
|
||||
|
||||
if connection_state is not None:
|
||||
last_disc = connection_state.get("last_disconnect")
|
||||
payload["broker_connection"] = {
|
||||
"broker_reachable": bool(connection_state.get("connected")),
|
||||
"reconnect_count": connection_state.get("reconnect_count", 0),
|
||||
"last_disconnect_at": (
|
||||
datetime.fromtimestamp(last_disc, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
if last_disc else None
|
||||
),
|
||||
}
|
||||
|
||||
topic = f"infoscreen/{client_id}/health"
|
||||
res = client.publish(topic, json.dumps(payload), qos=1)
|
||||
@@ -1007,6 +1404,10 @@ def main():
|
||||
power_intent_topic = None
|
||||
last_power_intent_id = None
|
||||
last_power_issued_at = None
|
||||
command_topic = f"infoscreen/{client_id}/commands"
|
||||
command_topic_alias = f"infoscreen/{client_id}/command"
|
||||
processed_commands = load_processed_commands()
|
||||
pending_recovery_command = load_last_command_state()
|
||||
|
||||
# paho-mqtt v2: opt into latest callback API to avoid deprecation warnings.
|
||||
client_kwargs = {"protocol": mqtt.MQTTv311}
|
||||
@@ -1018,12 +1419,18 @@ def main():
|
||||
pass
|
||||
client = mqtt.Client(**client_kwargs)
|
||||
client.on_message = on_message
|
||||
configure_mqtt_security(client)
|
||||
|
||||
# Enable automatic reconnection
|
||||
client.reconnect_delay_set(min_delay=1, max_delay=120)
|
||||
|
||||
# Connection state tracking
|
||||
connection_state = {"connected": False, "last_disconnect": None}
|
||||
connection_state = {
|
||||
"connected": False,
|
||||
"last_disconnect": None,
|
||||
"reconnect_count": 0,
|
||||
"connect_count": 0,
|
||||
}
|
||||
|
||||
# Optional: Enable MQTT debug logging in DEBUG_MODE
|
||||
if DEBUG_MODE:
|
||||
@@ -1090,6 +1497,9 @@ def main():
|
||||
if rc == 0:
|
||||
connection_state["connected"] = True
|
||||
connection_state["last_disconnect"] = None
|
||||
connection_state["connect_count"] = connection_state.get("connect_count", 0) + 1
|
||||
if connection_state["connect_count"] > 1:
|
||||
connection_state["reconnect_count"] = connection_state.get("reconnect_count", 0) + 1
|
||||
|
||||
# Check if this is a reconnection
|
||||
# paho-mqtt v2 provides ConnectFlags with attribute 'session_present'
|
||||
@@ -1121,6 +1531,11 @@ def main():
|
||||
group_id_topic = f"infoscreen/{client_id}/group_id"
|
||||
client.subscribe(group_id_topic)
|
||||
logging.info(f"Subscribed to: {group_id_topic}")
|
||||
|
||||
# Command topics (canonical + transitional)
|
||||
client.subscribe(command_topic, qos=1)
|
||||
client.subscribe(command_topic_alias, qos=1)
|
||||
logging.info(f"Subscribed to command topics: {command_topic}, {command_topic_alias}")
|
||||
|
||||
# Wenn beim Start eine group_id vorhanden ist, sofort Event-Topic abonnieren
|
||||
# Reset event_topic so subscribe_event_topic always re-registers with the broker
|
||||
@@ -1233,6 +1648,189 @@ def main():
|
||||
client.message_callback_add(group_id_topic, on_group_id_message)
|
||||
logging.info(f"Current group_id at start: {current_group_id if current_group_id else 'none'}")
|
||||
|
||||
def mark_command_processed(command_id, status, error_code=None):
    """Record a terminal outcome for *command_id* in the dedupe map and persist it.

    Closure over main()'s `processed_commands`; the write-through keeps the
    on-disk dedupe state consistent with memory after every command.
    """
    processed_commands[command_id] = {
        "status": status,
        "error_code": error_code,  # None unless status == "failed"
        "processed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    persist_processed_commands(processed_commands)
|
||||
|
||||
def execute_command_action(action):
    """Run the privileged command helper for *action* via sudo.

    Returns a tuple ``(ok, error_code, error_message)``; the error
    fields are None on success.
    """
    try:
        proc = subprocess.run(
            ["sudo", COMMAND_HELPER_PATH, action],
            timeout=max(1, COMMAND_EXEC_TIMEOUT_SEC),
            check=False,
            capture_output=True,
            text=True,
        )
    except subprocess.TimeoutExpired:
        return False, "execution_timeout", f"command timed out after {COMMAND_EXEC_TIMEOUT_SEC}s"
    except PermissionError:
        return False, "permission_denied_local", "permission denied invoking command helper"
    except Exception as e:
        return False, "internal_error", f"internal execution error: {e}"

    if proc.returncode == 0:
        return True, None, None

    stderr = (proc.stderr or "").strip()
    # 126/127 follow the shell convention: not executable / not found.
    if proc.returncode in (126, 127):
        return False, "permission_denied_local", stderr or "command helper unavailable"
    return False, "execution_failed", stderr or f"helper exited with code {proc.returncode}"
|
||||
|
||||
def on_command_message(client, userdata, msg, properties=None):
    """Handle an incoming remote command and drive its ack lifecycle.

    Flow: parse/validate -> dedupe -> "accepted" + "execution_started"
    acks -> persist pre-execution state -> run helper -> "completed" or
    "failed" ack. Reboot commands (unless the mock canary applies) defer
    the "completed" ack to the post-restart recovery path in the
    heartbeat loop. Closure over main()'s `client_id` and
    `processed_commands`.
    """
    payload_text = msg.payload.decode().strip()
    received_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    expires_at = None

    # Empty payload: ack failure against the nil command id so the server
    # still sees a correlated response.
    if not payload_text:
        publish_command_ack(
            client,
            client_id,
            NIL_COMMAND_ID,
            "failed",
            error_code="invalid_schema",
            error_message="empty command payload",
        )
        return

    try:
        payload = json.loads(payload_text)
    except json.JSONDecodeError as e:
        publish_command_ack(
            client,
            client_id,
            NIL_COMMAND_ID,
            "failed",
            error_code="invalid_schema",
            error_message=f"invalid JSON: {e}",
        )
        return

    # Best-effort id/expiry extraction so even an invalid command gets a
    # correlated, expiry-bounded failure ack.
    command_id_hint = _extract_command_id(payload)
    if isinstance(payload, dict):
        expires_at = payload.get("expires_at")

    is_valid, normalized, error_code, error_message = validate_command_payload(payload, client_id)
    if not is_valid:
        publish_command_ack(
            client,
            client_id,
            command_id_hint,
            "failed",
            error_code=error_code,
            error_message=error_message,
            expires_at=expires_at,
        )
        return

    command_id = normalized["command_id"]
    expires_at = normalized["expires_at"]
    action = normalized["action"]

    # Dedupe: a command id is executed at most once.
    if command_id in processed_commands:
        publish_command_ack(
            client,
            client_id,
            command_id,
            "failed",
            error_code="duplicate_command",
            error_message="command_id already processed",
            expires_at=expires_at,
        )
        return

    publish_command_ack(
        client,
        client_id,
        command_id,
        "accepted",
        expires_at=expires_at,
    )

    publish_command_ack(
        client,
        client_id,
        command_id,
        "execution_started",
        expires_at=expires_at,
    )

    # Persist pre-execution state so a reboot command can be completed
    # after the process comes back up.
    write_last_command_state({
        "command_id": command_id,
        "action": action,
        "ack_status": "execution_started",
        "received_at": received_at,
        "execution_started_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "expires_at": expires_at,
        "source_topic": msg.topic,
    })

    ok, exec_error_code, exec_error_message = execute_command_action(action)
    if ok:
        if command_requires_recovery_completion(action):
            if command_mock_reboot_immediate_complete_enabled(action):
                # Mock canary: fall through to immediate completion below.
                logging.info(
                    "Mock reboot immediate completion enabled: command_id=%s action=%s",
                    command_id,
                    action,
                )
            else:
                # Real reboot: "completed" is acked after restart by the
                # recovery path — stop here.
                logging.info(
                    "Command entered recovery completion path: command_id=%s action=%s",
                    command_id,
                    action,
                )
                return

        logging.info(
            "Command continuing to immediate completion path: command_id=%s action=%s",
            command_id,
            action,
        )

        publish_command_ack(
            client,
            client_id,
            command_id,
            "completed",
            expires_at=expires_at,
        )
        mark_command_processed(command_id, "completed")
        write_last_command_state({
            "command_id": command_id,
            "action": action,
            "ack_status": "completed",
            "completed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "expires_at": expires_at,
        })
        return

    # Helper failed: ack failure and record the terminal state.
    publish_command_ack(
        client,
        client_id,
        command_id,
        "failed",
        error_code=exec_error_code,
        error_message=exec_error_message,
        expires_at=expires_at,
    )
    mark_command_processed(command_id, "failed", error_code=exec_error_code)
    write_last_command_state({
        "command_id": command_id,
        "action": action,
        "ack_status": "failed",
        "error_code": exec_error_code,
        "error_message": exec_error_message,
        "failed_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "expires_at": expires_at,
    })
|
||||
|
||||
client.message_callback_add(command_topic, on_command_message)
|
||||
client.message_callback_add(command_topic_alias, on_command_message)
|
||||
|
||||
def on_power_intent_message(client, userdata, msg, properties=None):
|
||||
nonlocal last_power_intent_id, last_power_issued_at
|
||||
|
||||
@@ -1396,7 +1994,8 @@ def main():
|
||||
# Heartbeat-Loop with connection state monitoring
|
||||
last_heartbeat = 0
|
||||
logging.info("Entering heartbeat loop (network loop already running in background thread)")
|
||||
|
||||
_sd_notify("READY=1") # tell systemd the process is fully initialised
|
||||
|
||||
while True:
|
||||
try:
|
||||
current_time = time.time()
|
||||
@@ -1416,7 +2015,31 @@ def main():
|
||||
if result.rc == mqtt.MQTT_ERR_SUCCESS:
|
||||
logging.info("Heartbeat sent.")
|
||||
# Also send health and screenshot heartbeats
|
||||
publish_health_message(client, client_id)
|
||||
publish_health_message(client, client_id, connection_state)
|
||||
if (
|
||||
isinstance(pending_recovery_command, dict)
|
||||
and pending_recovery_command.get("ack_status") == "execution_started"
|
||||
and pending_recovery_command.get("action") == "reboot_host"
|
||||
and pending_recovery_command.get("command_id")
|
||||
):
|
||||
recovered_command_id = pending_recovery_command.get("command_id")
|
||||
recovered_expires = pending_recovery_command.get("expires_at")
|
||||
publish_command_ack(
|
||||
client,
|
||||
client_id,
|
||||
recovered_command_id,
|
||||
"completed",
|
||||
expires_at=recovered_expires,
|
||||
)
|
||||
mark_command_processed(recovered_command_id, "completed")
|
||||
write_last_command_state({
|
||||
"command_id": recovered_command_id,
|
||||
"action": pending_recovery_command.get("action"),
|
||||
"ack_status": "recovered",
|
||||
"recovered_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"expires_at": recovered_expires,
|
||||
})
|
||||
pending_recovery_command = None
|
||||
elif result.rc == mqtt.MQTT_ERR_NO_CONN:
|
||||
logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...")
|
||||
time.sleep(2)
|
||||
@@ -1424,7 +2047,7 @@ def main():
|
||||
retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
|
||||
if retry.rc == mqtt.MQTT_ERR_SUCCESS:
|
||||
logging.info("Heartbeat sent after retry.")
|
||||
publish_health_message(client, client_id)
|
||||
publish_health_message(client, client_id, connection_state)
|
||||
else:
|
||||
logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}")
|
||||
else:
|
||||
@@ -1435,6 +2058,7 @@ def main():
|
||||
logging.debug("Skipping heartbeat - MQTT not connected (is_connected=False)")
|
||||
last_heartbeat = current_time
|
||||
|
||||
_sd_notify("WATCHDOG=1") # kick systemd watchdog each loop iteration
|
||||
time.sleep(5)
|
||||
except KeyboardInterrupt:
|
||||
logging.info("Shutting down gracefully...")
|
||||
|
||||
Reference in New Issue
Block a user