feat(client-monitoring): finalize client-side monitoring and UTC logging

- add process health bridge and monitoring flow between display_manager and simclient - publish health + warn/error log topics over MQTT - standardize log/payload/screenshot timestamps to UTC (Z) to avoid DST drift - improve video handling: python-vlc fullscreen enforcement and runtime PID reporting - update README and copilot instructions with monitoring architecture and troubleshooting - add Phase 3 monitoring implementation documentation - update gitignore for new runtime/log artifacts
2026-03-11 20:24:38 +01:00
parent 1c445f4ba7
commit 80e5ce98a0
6 changed files with 837 additions and 18 deletions
--- a/src/display_manager.py
+++ b/src/display_manager.py
@@ -23,6 +23,7 @@ from logging.handlers import RotatingFileHandler
 from typing import Optional, Dict, List, IO
 from urllib.parse import urlparse
 import requests
+import psutil
 import json as _json
 import threading
 import time as _time
@@ -67,12 +68,15 @@ os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

 logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
-    format="%(asctime)s [%(levelname)s] %(message)s",
+    format="%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s",
+    datefmt="%Y-%m-%dT%H:%M:%S",
    handlers=[
        RotatingFileHandler(LOG_PATH, maxBytes=2*1024*1024, backupCount=5),
        logging.StreamHandler()
    ]
 )
+# Force all logging timestamps to UTC (affects %(asctime)s in all formatters).
+logging.Formatter.converter = time.gmtime

 # Log CEC mode after logging is configured
 if ENV == "development":
@@ -82,6 +86,90 @@ elif CEC_ENABLED:
 else:
    logging.info("[CEC] HDMI-CEC disabled in configuration")

+# Setup monitoring logger (separate from main display_manager.log for health/crash tracking)
+MONITORING_LOG_PATH = os.path.join(os.path.dirname(__file__), "..", "logs", "monitoring.log")
+os.makedirs(os.path.dirname(MONITORING_LOG_PATH), exist_ok=True)
+monitoring_logger = logging.getLogger("monitoring")
+monitoring_logger.setLevel(getattr(logging, LOG_LEVEL.upper(), logging.INFO))
+monitoring_handler = RotatingFileHandler(MONITORING_LOG_PATH, maxBytes=5*1024*1024, backupCount=5)
+monitoring_handler.setFormatter(logging.Formatter("%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s", "%Y-%m-%dT%H:%M:%S"))
+monitoring_logger.addHandler(monitoring_handler)
+monitoring_logger.propagate = False  # Don't duplicate to main logger
+logging.info(f"Monitoring logger initialized: {MONITORING_LOG_PATH}")
+
+# Health state file (shared bridge between display_manager and simclient)
+HEALTH_STATE_FILE = os.path.join(os.path.dirname(__file__), "current_process_health.json")
+
+
+class ProcessHealthState:
+    """Track and persist process health state for monitoring integration"""
+    
+    def __init__(self):
+        self.event_id = None
+        self.event_type = None
+        self.process_name = None
+        self.process_pid = None
+        self.status = "stopped"  # running, crashed, starting, stopped
+        self.restart_count = 0
+        self.max_restarts = 3
+        self.last_update = datetime.now(timezone.utc).isoformat()
+    
+    def to_dict(self) -> Dict:
+        return {
+            "event_id": self.event_id,
+            "event_type": self.event_type,
+            "current_process": self.process_name,
+            "process_pid": self.process_pid,
+            "process_status": self.status,
+            "restart_count": self.restart_count,
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }
+    
+    def save(self):
+        """Persist health state to file for simclient to read"""
+        try:
+            with open(HEALTH_STATE_FILE, "w") as f:
+                json.dump(self.to_dict(), f, indent=2)
+        except Exception as e:
+            monitoring_logger.error(f"Failed to save health state: {e}")
+    
+    def update_running(self, event_id: str, event_type: str, process_name: str, pid: Optional[int]):
+        """Mark process as running"""
+        self.event_id = event_id
+        self.event_type = event_type
+        self.process_name = process_name
+        self.process_pid = pid
+        self.status = "running"
+        self.restart_count = 0
+        self.save()
+        monitoring_logger.info(f"Process started: event_id={event_id} event_type={event_type} process={process_name} pid={pid}")
+    
+    def update_crashed(self):
+        """Mark process as crashed"""
+        self.status = "crashed"
+        self.save()
+        monitoring_logger.warning(f"Process crashed: event_id={self.event_id} event_type={self.event_type} process={self.process_name} restart_count={self.restart_count}/{self.max_restarts}")
+    
+    def update_restart_attempt(self):
+        """Increment restart counter and check if max exceeded"""
+        self.restart_count += 1
+        self.save()
+        if self.restart_count <= self.max_restarts:
+            monitoring_logger.warning(f"Restarting process: attempt {self.restart_count}/{self.max_restarts} for {self.process_name}")
+        else:
+            monitoring_logger.error(f"Max restart attempts exceeded for {self.process_name}: event_id={self.event_id}")
+    
+    def update_stopped(self):
+        """Mark process as stopped (normal event end)"""
+        self.status = "stopped"
+        self.event_id = None
+        self.event_type = None
+        self.process_name = None
+        self.process_pid = None
+        self.restart_count = 0
+        self.save()
+        monitoring_logger.info("Process stopped (event ended or no active event)")
+

 class HDMICECController:
    """Controls HDMI-CEC to turn TV on/off automatically
@@ -327,7 +415,7 @@ class DisplayProcess:
        self.player = player
        self.event_type = event_type
        self.event_id = event_id
-        self.start_time = datetime.now()
+        self.start_time = datetime.now(timezone.utc)
        self.log_file = log_file
        self.log_path = log_path
        
@@ -473,6 +561,9 @@ class DisplayManager:
        self.last_file_mtime: Optional[float] = None
        self.running = True
        
+        # Initialize health state tracking for process monitoring
+        self.health = ProcessHealthState()
+        
        # Initialize HDMI-CEC controller
        self.cec = HDMICECController(
            enabled=CEC_ENABLED,
@@ -629,7 +720,9 @@ class DisplayManager:
            time.sleep(1)
            if self.current_process.is_running():
                self.current_process.terminate(force=True)
-                
+            
+            # Update health state to reflect process stopped
+            self.health.update_stopped()
            self.current_process = None
            self.current_event_data = None
        
@@ -852,6 +945,9 @@ class DisplayManager:
            event_id = self.get_event_identifier(event)
            logging.info(f"Presentation started with PID: {process.pid}")
            
+            # Update health state for monitoring
+            self.health.update_running(event_id, 'presentation', 'impressive', process.pid)
+            
            return DisplayProcess(process, 'presentation', event_id, log_file=impressive_log, log_path=impressive_log_path)
            
        except Exception as e:
@@ -904,6 +1000,8 @@ class DisplayManager:
                    )
                    event_id = self.get_event_identifier(event)
                    logging.info(f"Video started with PID: {process.pid} (external vlc)")
+                    # Update health state for monitoring
+                    self.health.update_running(event_id, 'video', 'vlc', process.pid)
                    return DisplayProcess(process=process, event_type='video', event_id=event_id, log_file=video_log, log_path=video_log_path)
                else:
                    logging.error("No video player found (python-vlc or vlc binary)")
@@ -911,7 +1009,24 @@ class DisplayManager:

            # Use libvlc via python-vlc
            try:
-                instance = vlc.Instance()
+                # Keep python-vlc behavior aligned with external vlc fullscreen mode.
+                instance = vlc.Instance('--fullscreen', '--no-video-title-show')
+
+                def _force_fullscreen(player_obj, label: str):
+                    """Retry fullscreen toggle because video outputs may attach asynchronously."""
+                    if not player_obj:
+                        return
+
+                    def _worker():
+                        for delay in (0.0, 0.4, 1.0):
+                            if delay > 0:
+                                time.sleep(delay)
+                            try:
+                                player_obj.set_fullscreen(True)
+                            except Exception as e:
+                                logging.debug(f"Could not force fullscreen for {label}: {e}")
+
+                    threading.Thread(target=_worker, daemon=True).start()

                autoplay = bool(video.get('autoplay', True))
                loop_flag = bool(video.get('loop', False))
@@ -957,10 +1072,14 @@ class DisplayManager:
                    if autoplay:
                        try:
                            mlp.play()
+                            _force_fullscreen(mp, 'python-vlc MediaListPlayer')
                        except Exception as e:
                            logging.error(f"Failed to play media list: {e}")
                    event_id = self.get_event_identifier(event)
-                    logging.info("Video started via python-vlc (MediaListPlayer)" )
+                    runtime_pid = os.getpid()
+                    logging.info(f"Video started via python-vlc (MediaListPlayer), runtime PID: {runtime_pid}")
+                    # python-vlc runs in-process (no external vlc child), so publish this process PID.
+                    self.health.update_running(event_id, 'video', 'vlc', runtime_pid)
                    return DisplayProcess(process=None, event_type='video', event_id=event_id, log_file=None, log_path=None, player=mlp)

                else:
@@ -976,11 +1095,15 @@ class DisplayManager:
                    if autoplay:
                        try:
                            mp.play()
+                            _force_fullscreen(mp, 'python-vlc MediaPlayer')
                        except Exception as e:
                            logging.error(f"Failed to start media player: {e}")

                    event_id = self.get_event_identifier(event)
-                    logging.info("Video started via python-vlc (MediaPlayer)")
+                    runtime_pid = os.getpid()
+                    logging.info(f"Video started via python-vlc (MediaPlayer), runtime PID: {runtime_pid}")
+                    # python-vlc runs in-process (no external vlc child), so publish this process PID.
+                    self.health.update_running(event_id, 'video', 'vlc', runtime_pid)
                    return DisplayProcess(process=None, event_type='video', event_id=event_id, log_file=None, log_path=None, player=mp)

            except Exception as e:
@@ -1137,6 +1260,10 @@ class DisplayManager:
            
            event_id = self.get_event_identifier(event)
            logging.info(f"Webpage started with PID: {process.pid}")
+            
+            # Update health state for monitoring (track chromium browser)
+            browser_name = 'chromium-browser' if self._command_exists('chromium-browser') else 'chromium'
+            self.health.update_running(event_id, etype, browser_name, process.pid)

            # Inject auto-scroll JS via Chrome DevTools Protocol (CDP) if enabled and available
            if autoscroll_enabled:
@@ -1367,9 +1494,20 @@ class DisplayManager:
                    if self.current_process.event_type == 'presentation' and exit_code == 0:
                        logging.info("Presentation process ended with exit code 0 (likely normal completion).")
                        self.current_process = None
+                        # Update health state to show normal completion
+                        self.health.update_stopped()
                        # Don't turn off TV yet - event might still be active
                        return

+                    # Process crashed unexpectedly
+                    self.health.update_crashed()
+                    self.health.update_restart_attempt()
+                    
+                    if self.health.restart_count > self.health.max_restarts:
+                        logging.error(f"Max restart attempts exceeded - giving up on {self.health.process_name}")
+                        self.current_process = None
+                        return
+                    
                    logging.info("Restarting display process...")
                    self.current_process = None
                    # Don't turn off TV when restarting same event
@@ -1413,10 +1551,8 @@ class DisplayManager:
        
        # Log timezone information for debugging
        now_utc = datetime.now(timezone.utc)
-        now_local = datetime.now()
        logging.info(f"Current time (UTC): {now_utc.strftime('%Y-%m-%d %H:%M:%S %Z')}")
-        logging.info(f"Current time (Local): {now_local.strftime('%Y-%m-%d %H:%M:%S')}")
-        logging.info("Note: Event times are expected in UTC format")
+        logging.info("Event times are expected in UTC format")
        
        while self.running:
            try:
@@ -1469,7 +1605,7 @@ class DisplayManager:
        Maintains a 'latest.jpg' file and rotates older screenshots beyond SCREENSHOT_MAX_FILES.
        """
        try:
-            ts = datetime.now().strftime('%Y%m%d_%H%M%S')
+            ts = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
            raw_path = os.path.join(self.screenshot_dir, f'screenshot_{ts}.png')
            final_path = os.path.join(self.screenshot_dir, f'screenshot_{ts}.jpg')

--- a/src/simclient.py
+++ b/src/simclient.py
@@ -15,7 +15,7 @@ import logging
 from dotenv import load_dotenv
 import requests
 import base64
-from datetime import datetime
+from datetime import datetime, timezone
 import threading
 from urllib.parse import urlsplit, urlunsplit, unquote

@@ -123,9 +123,26 @@ if DEBUG_MODE:
    log_handlers.append(logging.StreamHandler())
 logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
-    format="%(asctime)s [%(levelname)s] %(message)s",
+    format="%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s",
+    datefmt="%Y-%m-%dT%H:%M:%S",
    handlers=log_handlers
 )
+# Force all logging timestamps to UTC (affects %(asctime)s in all formatters).
+logging.Formatter.converter = time.gmtime
+
+# Setup monitoring logger (separate for health/crash events, local rotation only)
+MONITORING_LOG_PATH = os.path.join(os.path.dirname(__file__), "..", "logs", "monitoring.log")
+os.makedirs(os.path.dirname(MONITORING_LOG_PATH), exist_ok=True)
+monitoring_logger = logging.getLogger("monitoring")
+monitoring_logger.setLevel(getattr(logging, LOG_LEVEL.upper(), logging.INFO))
+monitoring_handler = RotatingFileHandler(MONITORING_LOG_PATH, maxBytes=5*1024*1024, backupCount=5)
+monitoring_handler.setFormatter(logging.Formatter("%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s", "%Y-%m-%dT%H:%M:%S"))
+monitoring_logger.addHandler(monitoring_handler)
+monitoring_logger.propagate = False  # Don't duplicate to main logger
+logging.info(f"Monitoring logger initialized: {MONITORING_LOG_PATH}")
+
+# Health state file (written by display_manager, read by simclient)
+HEALTH_STATE_FILE = os.path.join(os.path.dirname(__file__), "current_process_health.json")


 discovered = False
@@ -485,7 +502,7 @@ def get_latest_screenshot():
                return {
                    "filename": os.path.basename(preferred_path),
                    "data": screenshot_data,
-                    "timestamp": datetime.fromtimestamp(file_stats.st_mtime).isoformat(),
+                    "timestamp": datetime.fromtimestamp(file_stats.st_mtime, tz=timezone.utc).isoformat(),
                    "size": file_stats.st_size
                }
            except Exception as e:
@@ -514,7 +531,7 @@ def get_latest_screenshot():
        info = {
            "filename": latest_file,
            "data": screenshot_data,
-            "timestamp": datetime.fromtimestamp(file_stats.st_mtime).isoformat(),
+            "timestamp": datetime.fromtimestamp(file_stats.st_mtime, tz=timezone.utc).isoformat(),
            "size": file_stats.st_size
        }
        logging.debug(f"Selected latest screenshot: {latest_file} ({file_stats.st_size} bytes)")
@@ -525,13 +542,82 @@ def get_latest_screenshot():
        return None


+def read_health_state():
+    """Read the health state file written by display_manager"""
+    try:
+        if not os.path.exists(HEALTH_STATE_FILE):
+            return None
+        with open(HEALTH_STATE_FILE, 'r') as f:
+            return json.load(f)
+    except Exception as e:
+        logging.debug(f"Could not read health state file: {e}")
+        return None
+
+
+def publish_health_message(client, client_id):
+    """Publish health status to server via MQTT"""
+    try:
+        health = read_health_state()
+        if not health:
+            return  # No active process
+        
+        payload = {
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "expected_state": {
+                "event_id": health.get("event_id")
+            },
+            "actual_state": {
+                "process": health.get("current_process"),
+                "pid": health.get("process_pid"),
+                "status": health.get("process_status")
+            }
+        }
+        
+        topic = f"infoscreen/{client_id}/health"
+        res = client.publish(topic, json.dumps(payload), qos=1)
+        if res.rc == mqtt.MQTT_ERR_SUCCESS:
+            logging.debug(f"Health message published: {health.get('current_process')} status={health.get('process_status')}")
+        else:
+            logging.debug(f"Health publish failed with code: {res.rc}")
+    except Exception as e:
+        logging.debug(f"Error publishing health: {e}")
+
+
+def publish_log_message(client, client_id, level: str, message: str, context: dict = None):
+    """Publish log message to server via MQTT (only if level is ERROR or WARN, unless DEBUG_MODE)"""
+    try:
+        # Filter logs: only send ERROR/WARN to server, keep INFO/DEBUG local-only unless DEBUG_MODE
+        if level.upper() == "INFO" and not DEBUG_MODE:
+            return  # Keep INFO logs local only in production
+        if level.upper() == "DEBUG":
+            return  # DEBUG logs always local-only
+        
+        topic = f"infoscreen/{client_id}/logs/{level.lower()}"
+        payload = {
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "message": message,
+            "context": context or {}
+        }
+        
+        res = client.publish(topic, json.dumps(payload), qos=1)
+        if res.rc == mqtt.MQTT_ERR_SUCCESS:
+            monitoring_logger.log(getattr(logging, level.upper(), logging.INFO), f"[MQTT] {message}")
+        else:
+            logging.debug(f"Log publish failed ({level}) with code: {res.rc}")
+    except Exception as e:
+        logging.debug(f"Error publishing log: {e}")
+
+
 def send_screenshot_heartbeat(client, client_id):
    """Send heartbeat with screenshot to server for dashboard monitoring"""
    try:
        screenshot_info = get_latest_screenshot()
        
+        # Also read health state and include in heartbeat
+        health = read_health_state()
+        
        heartbeat_data = {
-            "timestamp": datetime.now().isoformat(),
+            "timestamp": datetime.now(timezone.utc).isoformat(),
            "client_id": client_id,
            "status": "alive",
            "screenshot": screenshot_info,
@@ -542,6 +628,17 @@ def send_screenshot_heartbeat(client, client_id):
            }
        }
        
+        # Include health info if available (from display_manager)
+        if health:
+            heartbeat_data["process_health"] = {
+                "event_id": health.get("event_id"),
+                "event_type": health.get("event_type"),
+                "current_process": health.get("current_process"),
+                "process_pid": health.get("process_pid"),
+                "process_status": health.get("process_status"),
+                "restart_count": health.get("restart_count", 0)
+            }
+        
        # Send to dashboard monitoring topic
        dashboard_topic = f"infoscreen/{client_id}/dashboard"
        payload = json.dumps(heartbeat_data)
@@ -575,7 +672,7 @@ def screenshot_service_thread(client, client_id):

 def main():
    global discovered
-    print(f"[{datetime.now().isoformat()}] simclient.py: program started")
+    print(f"[{datetime.now(timezone.utc).isoformat()}] simclient.py: program started")
    logging.info("Client starting - deleting old event file if present")
    delete_event_file()

@@ -840,6 +937,8 @@ def main():
                    result = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
                    if result.rc == mqtt.MQTT_ERR_SUCCESS:
                        logging.info("Heartbeat sent.")
+                        # Also send health and screenshot heartbeats
+                        publish_health_message(client, client_id)
                    elif result.rc == mqtt.MQTT_ERR_NO_CONN:
                        logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...")
                        time.sleep(2)
@@ -847,6 +946,7 @@ def main():
                            retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
                            if retry.rc == mqtt.MQTT_ERR_SUCCESS:
                                logging.info("Heartbeat sent after retry.")
+                                publish_health_message(client, client_id)
                            else:
                                logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}")
                        else: