feat(client-monitoring): finalize client-side monitoring and UTC logging
- add process health bridge and monitoring flow between display_manager and simclient
- publish health + warn/error log topics over MQTT
- standardize log/payload/screenshot timestamps to UTC (Z) to avoid DST drift
- improve video handling: python-vlc fullscreen enforcement and runtime PID reporting
- update README and copilot instructions with monitoring architecture and troubleshooting
- add Phase 3 monitoring implementation documentation
- update gitignore for new runtime/log artifacts
This commit is contained in:
112
src/simclient.py
112
src/simclient.py
@@ -15,7 +15,7 @@ import logging
|
||||
from dotenv import load_dotenv
|
||||
import requests
|
||||
import base64
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
import threading
|
||||
from urllib.parse import urlsplit, urlunsplit, unquote
|
||||
|
||||
@@ -123,9 +123,26 @@ if DEBUG_MODE:
|
||||
log_handlers.append(logging.StreamHandler())
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
format="%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S",
|
||||
handlers=log_handlers
|
||||
)
|
||||
# Force all logging timestamps to UTC (affects %(asctime)s in all formatters).
|
||||
logging.Formatter.converter = time.gmtime
|
||||
|
||||
# Setup monitoring logger (separate for health/crash events, local rotation only)
|
||||
MONITORING_LOG_PATH = os.path.join(os.path.dirname(__file__), "..", "logs", "monitoring.log")
|
||||
os.makedirs(os.path.dirname(MONITORING_LOG_PATH), exist_ok=True)
|
||||
monitoring_logger = logging.getLogger("monitoring")
|
||||
monitoring_logger.setLevel(getattr(logging, LOG_LEVEL.upper(), logging.INFO))
|
||||
monitoring_handler = RotatingFileHandler(MONITORING_LOG_PATH, maxBytes=5*1024*1024, backupCount=5)
|
||||
monitoring_handler.setFormatter(logging.Formatter("%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s", "%Y-%m-%dT%H:%M:%S"))
|
||||
monitoring_logger.addHandler(monitoring_handler)
|
||||
monitoring_logger.propagate = False # Don't duplicate to main logger
|
||||
logging.info(f"Monitoring logger initialized: {MONITORING_LOG_PATH}")
|
||||
|
||||
# Health state file (written by display_manager, read by simclient)
|
||||
HEALTH_STATE_FILE = os.path.join(os.path.dirname(__file__), "current_process_health.json")
|
||||
|
||||
|
||||
discovered = False
|
||||
@@ -485,7 +502,7 @@ def get_latest_screenshot():
|
||||
return {
|
||||
"filename": os.path.basename(preferred_path),
|
||||
"data": screenshot_data,
|
||||
"timestamp": datetime.fromtimestamp(file_stats.st_mtime).isoformat(),
|
||||
"timestamp": datetime.fromtimestamp(file_stats.st_mtime, tz=timezone.utc).isoformat(),
|
||||
"size": file_stats.st_size
|
||||
}
|
||||
except Exception as e:
|
||||
@@ -514,7 +531,7 @@ def get_latest_screenshot():
|
||||
info = {
|
||||
"filename": latest_file,
|
||||
"data": screenshot_data,
|
||||
"timestamp": datetime.fromtimestamp(file_stats.st_mtime).isoformat(),
|
||||
"timestamp": datetime.fromtimestamp(file_stats.st_mtime, tz=timezone.utc).isoformat(),
|
||||
"size": file_stats.st_size
|
||||
}
|
||||
logging.debug(f"Selected latest screenshot: {latest_file} ({file_stats.st_size} bytes)")
|
||||
@@ -525,13 +542,82 @@ def get_latest_screenshot():
|
||||
return None
|
||||
|
||||
|
||||
def read_health_state():
    """Load the process-health snapshot that display_manager writes to disk.

    Returns:
        The decoded JSON content of ``HEALTH_STATE_FILE``, or ``None`` when
        the file does not exist or when reading/parsing it fails for any
        reason (the failure is logged at DEBUG level only).
    """
    try:
        if os.path.exists(HEALTH_STATE_FILE):
            with open(HEALTH_STATE_FILE, 'r') as state_file:
                return json.load(state_file)
        # No state file present: nothing has been reported yet.
        return None
    except Exception as e:
        # Best-effort read: a missing/partial/corrupt file must never break
        # the caller, so every failure is reduced to "no health info".
        logging.debug(f"Could not read health state file: {e}")
    return None
|
||||
|
||||
|
||||
def publish_health_message(client, client_id):
    """Publish the current process-health snapshot over MQTT.

    The snapshot is obtained from ``read_health_state()``; when that yields
    nothing (falsy result) the function returns without publishing. The
    message is sent with QoS 1 to ``infoscreen/<client_id>/health``.

    Args:
        client: MQTT client object exposing ``publish(topic, payload, qos=...)``
            whose result carries an ``rc`` attribute.
        client_id: Identifier interpolated into the topic name.

    Returns:
        None. Any exception raised along the way is logged at DEBUG level and
        swallowed so that health reporting never interrupts the caller.
    """
    try:
        health = read_health_state()
        if not health:
            # Nothing to report (no active process / no state available).
            return

        process_name = health.get("current_process")
        process_status = health.get("process_status")

        # Key order is kept stable: timestamp, expected_state, actual_state.
        payload = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "expected_state": {"event_id": health.get("event_id")},
            "actual_state": {
                "process": process_name,
                "pid": health.get("process_pid"),
                "status": process_status,
            },
        }

        serialized = json.dumps(payload)
        res = client.publish(f"infoscreen/{client_id}/health", serialized, qos=1)

        if res.rc != mqtt.MQTT_ERR_SUCCESS:
            logging.debug(f"Health publish failed with code: {res.rc}")
        else:
            logging.debug(f"Health message published: {process_name} status={process_status}")
    except Exception as e:
        logging.debug(f"Error publishing health: {e}")
|
||||
|
||||
|
||||
def publish_log_message(client, client_id, level: str, message: str, context: dict = None):
    """Forward a log entry to the server over MQTT, subject to level filtering.

    Filtering rules implemented here:
      * ``DEBUG`` entries are never published.
      * ``INFO`` entries are published only when ``DEBUG_MODE`` is truthy.
      * Every other level (e.g. WARN/ERROR) is published.

    Published entries go to ``infoscreen/<client_id>/logs/<level lowercased>``
    with QoS 1. When the publish call reports success, the message is also
    written to the local monitoring logger, prefixed with ``[MQTT]``.

    Args:
        client: MQTT client object exposing ``publish(topic, payload, qos=...)``
            whose result carries an ``rc`` attribute.
        client_id: Identifier interpolated into the topic name.
        level: Log level name; compared case-insensitively.
        message: Text of the log entry.
        context: Optional extra data; an empty dict is sent when omitted/falsy.

    Returns:
        None. Any exception is logged at DEBUG level and swallowed.
    """
    try:
        level_name = level.upper()
        # DEBUG always stays local; INFO stays local unless DEBUG_MODE is on.
        if level_name == "DEBUG" or (level_name == "INFO" and not DEBUG_MODE):
            return

        payload = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "message": message,
            "context": context or {},
        }

        res = client.publish(
            f"infoscreen/{client_id}/logs/{level.lower()}",
            json.dumps(payload),
            qos=1,
        )

        if res.rc != mqtt.MQTT_ERR_SUCCESS:
            logging.debug(f"Log publish failed ({level}) with code: {res.rc}")
            return

        # Unknown level names fall back to INFO for the local record.
        local_level = getattr(logging, level.upper(), logging.INFO)
        monitoring_logger.log(local_level, f"[MQTT] {message}")
    except Exception as e:
        logging.debug(f"Error publishing log: {e}")
|
||||
|
||||
|
||||
def send_screenshot_heartbeat(client, client_id):
|
||||
"""Send heartbeat with screenshot to server for dashboard monitoring"""
|
||||
try:
|
||||
screenshot_info = get_latest_screenshot()
|
||||
|
||||
# Also read health state and include in heartbeat
|
||||
health = read_health_state()
|
||||
|
||||
heartbeat_data = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"client_id": client_id,
|
||||
"status": "alive",
|
||||
"screenshot": screenshot_info,
|
||||
@@ -542,6 +628,17 @@ def send_screenshot_heartbeat(client, client_id):
|
||||
}
|
||||
}
|
||||
|
||||
# Include health info if available (from display_manager)
|
||||
if health:
|
||||
heartbeat_data["process_health"] = {
|
||||
"event_id": health.get("event_id"),
|
||||
"event_type": health.get("event_type"),
|
||||
"current_process": health.get("current_process"),
|
||||
"process_pid": health.get("process_pid"),
|
||||
"process_status": health.get("process_status"),
|
||||
"restart_count": health.get("restart_count", 0)
|
||||
}
|
||||
|
||||
# Send to dashboard monitoring topic
|
||||
dashboard_topic = f"infoscreen/{client_id}/dashboard"
|
||||
payload = json.dumps(heartbeat_data)
|
||||
@@ -575,7 +672,7 @@ def screenshot_service_thread(client, client_id):
|
||||
|
||||
def main():
|
||||
global discovered
|
||||
print(f"[{datetime.now().isoformat()}] simclient.py: program started")
|
||||
print(f"[{datetime.now(timezone.utc).isoformat()}] simclient.py: program started")
|
||||
logging.info("Client starting - deleting old event file if present")
|
||||
delete_event_file()
|
||||
|
||||
@@ -840,6 +937,8 @@ def main():
|
||||
result = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
|
||||
if result.rc == mqtt.MQTT_ERR_SUCCESS:
|
||||
logging.info("Heartbeat sent.")
|
||||
# Also publish the process health message after a successful heartbeat
|
||||
publish_health_message(client, client_id)
|
||||
elif result.rc == mqtt.MQTT_ERR_NO_CONN:
|
||||
logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...")
|
||||
time.sleep(2)
|
||||
@@ -847,6 +946,7 @@ def main():
|
||||
retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
|
||||
if retry.rc == mqtt.MQTT_ERR_SUCCESS:
|
||||
logging.info("Heartbeat sent after retry.")
|
||||
publish_health_message(client, client_id)
|
||||
else:
|
||||
logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}")
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user