feat(client-monitoring): finalize client-side monitoring and UTC logging

- add process health bridge and monitoring flow between display_manager and simclient
- publish health + warn/error log topics over MQTT
- standardize log/payload/screenshot timestamps to UTC (Z) to avoid DST drift
- improve video handling: python-vlc fullscreen enforcement and runtime PID reporting
- update README and copilot instructions with monitoring architecture and troubleshooting
- add Phase 3 monitoring implementation documentation
- update gitignore for new runtime/log artifacts
This commit is contained in:
RobbStarkAustria
2026-03-11 20:24:38 +01:00
parent 1c445f4ba7
commit 80e5ce98a0
6 changed files with 837 additions and 18 deletions

View File

@@ -15,7 +15,7 @@ import logging
from dotenv import load_dotenv
import requests
import base64
from datetime import datetime
from datetime import datetime, timezone
import threading
from urllib.parse import urlsplit, urlunsplit, unquote
@@ -123,9 +123,26 @@ if DEBUG_MODE:
log_handlers.append(logging.StreamHandler())
logging.basicConfig(
level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
format="%(asctime)s [%(levelname)s] %(message)s",
format="%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S",
handlers=log_handlers
)
# Force all logging timestamps to UTC (affects %(asctime)s in all formatters).
# The converter is set on the Formatter class itself, so it also applies to the
# monitoring formatter created below; this is what makes the literal "Z" suffix
# in the format strings accurate.
logging.Formatter.converter = time.gmtime
# Setup monitoring logger (separate for health/crash events, local rotation only)
# The log file lives in a "logs" directory one level above this file's directory.
MONITORING_LOG_PATH = os.path.join(os.path.dirname(__file__), "..", "logs", "monitoring.log")
# Create the logs directory if it is missing; exist_ok avoids an error when it already exists.
os.makedirs(os.path.dirname(MONITORING_LOG_PATH), exist_ok=True)
monitoring_logger = logging.getLogger("monitoring")
# Same threshold as the root logger: LOG_LEVEL, falling back to INFO for unknown names.
monitoring_logger.setLevel(getattr(logging, LOG_LEVEL.upper(), logging.INFO))
# Rotate at 5 MiB per file and keep 5 rotated backups.
monitoring_handler = RotatingFileHandler(MONITORING_LOG_PATH, maxBytes=5*1024*1024, backupCount=5)
# Same ISO-8601-style UTC timestamp layout (millisecond precision) as the main log format.
monitoring_handler.setFormatter(logging.Formatter("%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s", "%Y-%m-%dT%H:%M:%S"))
monitoring_logger.addHandler(monitoring_handler)
monitoring_logger.propagate = False # Don't duplicate to main logger
logging.info(f"Monitoring logger initialized: {MONITORING_LOG_PATH}")
# Health state file (written by display_manager, read by simclient)
# It is located in the same directory as this file.
HEALTH_STATE_FILE = os.path.join(os.path.dirname(__file__), "current_process_health.json")
discovered = False
@@ -485,7 +502,7 @@ def get_latest_screenshot():
return {
"filename": os.path.basename(preferred_path),
"data": screenshot_data,
"timestamp": datetime.fromtimestamp(file_stats.st_mtime).isoformat(),
"timestamp": datetime.fromtimestamp(file_stats.st_mtime, tz=timezone.utc).isoformat(),
"size": file_stats.st_size
}
except Exception as e:
@@ -514,7 +531,7 @@ def get_latest_screenshot():
info = {
"filename": latest_file,
"data": screenshot_data,
"timestamp": datetime.fromtimestamp(file_stats.st_mtime).isoformat(),
"timestamp": datetime.fromtimestamp(file_stats.st_mtime, tz=timezone.utc).isoformat(),
"size": file_stats.st_size
}
logging.debug(f"Selected latest screenshot: {latest_file} ({file_stats.st_size} bytes)")
@@ -525,13 +542,82 @@ def get_latest_screenshot():
return None
def read_health_state():
    """Read the health state file written by display_manager.

    Returns:
        The parsed JSON object as a ``dict``, or ``None`` when the file does
        not exist, cannot be read or parsed, or does not contain a JSON
        object at the top level.

    This function never raises: every failure is reported as ``None`` so
    callers can treat "no health data" uniformly.
    """
    try:
        # Open directly instead of checking os.path.exists() first: the file
        # could disappear between the check and the open.
        with open(HEALTH_STATE_FILE, "r", encoding="utf-8") as f:
            state = json.load(f)
    except FileNotFoundError:
        # A missing file just means there is nothing to report.
        return None
    except Exception as e:
        # Best effort: unreadable or malformed content is logged locally only.
        logging.debug(f"Could not read health state file: {e}")
        return None

    # Callers access the result with .get(); reject anything that is not a
    # JSON object so they never receive a list, string or number.
    if not isinstance(state, dict):
        logging.debug(
            f"Could not read health state file: expected JSON object, got {type(state).__name__}"
        )
        return None
    return state
def publish_health_message(client, client_id):
    """Send the current process-health snapshot to the server over MQTT.

    The snapshot is obtained from ``read_health_state()`` and published as
    JSON on ``infoscreen/<client_id>/health`` with QoS 1. Nothing is sent
    when no health state is available.

    Args:
        client: MQTT client object providing ``publish(topic, payload, qos=...)``.
        client_id: Identifier interpolated into the topic name.

    Returns:
        None. Any exception is caught and logged at DEBUG level.
    """
    try:
        state = read_health_state()
        if not state:
            # Nothing to report.
            return

        process_name = state.get("current_process")
        process_status = state.get("process_status")
        body = json.dumps({
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "expected_state": {"event_id": state.get("event_id")},
            "actual_state": {
                "process": process_name,
                "pid": state.get("process_pid"),
                "status": process_status,
            },
        })

        result = client.publish(f"infoscreen/{client_id}/health", body, qos=1)
        if result.rc != mqtt.MQTT_ERR_SUCCESS:
            logging.debug(f"Health publish failed with code: {result.rc}")
            return
        logging.debug(f"Health message published: {process_name} status={process_status}")
    except Exception as exc:
        logging.debug(f"Error publishing health: {exc}")
def publish_log_message(client, client_id, level: str, message: str, context: dict = None):
    """Forward a log entry to the server over MQTT.

    Filtering (case-insensitive on ``level``):
      * DEBUG is never published.
      * INFO is published only when ``DEBUG_MODE`` is set.
      * Every other level is published.

    The entry is sent as JSON on ``infoscreen/<client_id>/logs/<level>``
    (level lower-cased) with QoS 1. After a successful publish the message
    is also written to ``monitoring_logger`` with a ``[MQTT]`` prefix.

    Args:
        client: MQTT client object providing ``publish(topic, payload, qos=...)``.
        client_id: Identifier interpolated into the topic name.
        level: Log level name, e.g. ``"ERROR"`` or ``"WARN"``.
        message: Log text.
        context: Optional extra data; an empty dict is sent when omitted.

    Returns:
        None. Any exception is caught and logged at DEBUG level.
    """
    try:
        severity = level.upper()
        # DEBUG always stays local; INFO stays local outside debug mode.
        if severity == "DEBUG" or (severity == "INFO" and not DEBUG_MODE):
            return

        destination = f"infoscreen/{client_id}/logs/{level.lower()}"
        entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "message": message,
            "context": context or {},
        }
        outcome = client.publish(destination, json.dumps(entry), qos=1)

        if outcome.rc != mqtt.MQTT_ERR_SUCCESS:
            logging.debug(f"Log publish failed ({level}) with code: {outcome.rc}")
            return
        # Unknown level names fall back to INFO for the local record.
        local_level = getattr(logging, severity, logging.INFO)
        monitoring_logger.log(local_level, f"[MQTT] {message}")
    except Exception as exc:
        logging.debug(f"Error publishing log: {exc}")
def send_screenshot_heartbeat(client, client_id):
"""Send heartbeat with screenshot to server for dashboard monitoring"""
try:
screenshot_info = get_latest_screenshot()
# Also read health state and include in heartbeat
health = read_health_state()
heartbeat_data = {
"timestamp": datetime.now().isoformat(),
"timestamp": datetime.now(timezone.utc).isoformat(),
"client_id": client_id,
"status": "alive",
"screenshot": screenshot_info,
@@ -542,6 +628,17 @@ def send_screenshot_heartbeat(client, client_id):
}
}
# Include health info if available (from display_manager)
if health:
heartbeat_data["process_health"] = {
"event_id": health.get("event_id"),
"event_type": health.get("event_type"),
"current_process": health.get("current_process"),
"process_pid": health.get("process_pid"),
"process_status": health.get("process_status"),
"restart_count": health.get("restart_count", 0)
}
# Send to dashboard monitoring topic
dashboard_topic = f"infoscreen/{client_id}/dashboard"
payload = json.dumps(heartbeat_data)
@@ -575,7 +672,7 @@ def screenshot_service_thread(client, client_id):
def main():
global discovered
print(f"[{datetime.now().isoformat()}] simclient.py: program started")
print(f"[{datetime.now(timezone.utc).isoformat()}] simclient.py: program started")
logging.info("Client starting - deleting old event file if present")
delete_event_file()
@@ -840,6 +937,8 @@ def main():
result = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
if result.rc == mqtt.MQTT_ERR_SUCCESS:
logging.info("Heartbeat sent.")
# Also publish process health alongside the heartbeat
publish_health_message(client, client_id)
elif result.rc == mqtt.MQTT_ERR_NO_CONN:
logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...")
time.sleep(2)
@@ -847,6 +946,7 @@ def main():
retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
if retry.rc == mqtt.MQTT_ERR_SUCCESS:
logging.info("Heartbeat sent after retry.")
publish_health_message(client, client_id)
else:
logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}")
else: