feat(client-monitoring): finalize client-side monitoring and UTC logging
- add process health bridge and monitoring flow between display_manager and simclient - publish health + warn/error log topics over MQTT - standardize log/payload/screenshot timestamps to UTC (Z) to avoid DST drift - improve video handling: python-vlc fullscreen enforcement and runtime PID reporting - update README and copilot instructions with monitoring architecture and troubleshooting - add Phase 3 monitoring implementation documentation - update gitignore for new runtime/log artifacts
This commit is contained in:
@@ -23,6 +23,7 @@ from logging.handlers import RotatingFileHandler
|
||||
from typing import Optional, Dict, List, IO
|
||||
from urllib.parse import urlparse
|
||||
import requests
|
||||
import psutil
|
||||
import json as _json
|
||||
import threading
|
||||
import time as _time
|
||||
@@ -67,12 +68,15 @@ os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
|
||||
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
format="%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S",
|
||||
handlers=[
|
||||
RotatingFileHandler(LOG_PATH, maxBytes=2*1024*1024, backupCount=5),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
# Force all logging timestamps to UTC (affects %(asctime)s in all formatters).
|
||||
logging.Formatter.converter = time.gmtime
|
||||
|
||||
# Log CEC mode after logging is configured
|
||||
if ENV == "development":
|
||||
@@ -82,6 +86,90 @@ elif CEC_ENABLED:
|
||||
else:
|
||||
logging.info("[CEC] HDMI-CEC disabled in configuration")
|
||||
|
||||
# Setup monitoring logger (separate from main display_manager.log for health/crash tracking)
|
||||
MONITORING_LOG_PATH = os.path.join(os.path.dirname(__file__), "..", "logs", "monitoring.log")
|
||||
os.makedirs(os.path.dirname(MONITORING_LOG_PATH), exist_ok=True)
|
||||
monitoring_logger = logging.getLogger("monitoring")
|
||||
monitoring_logger.setLevel(getattr(logging, LOG_LEVEL.upper(), logging.INFO))
|
||||
monitoring_handler = RotatingFileHandler(MONITORING_LOG_PATH, maxBytes=5*1024*1024, backupCount=5)
|
||||
monitoring_handler.setFormatter(logging.Formatter("%(asctime)s.%(msecs)03dZ [%(levelname)s] %(message)s", "%Y-%m-%dT%H:%M:%S"))
|
||||
monitoring_logger.addHandler(monitoring_handler)
|
||||
monitoring_logger.propagate = False # Don't duplicate to main logger
|
||||
logging.info(f"Monitoring logger initialized: {MONITORING_LOG_PATH}")
|
||||
|
||||
# Health state file (shared bridge between display_manager and simclient)
|
||||
HEALTH_STATE_FILE = os.path.join(os.path.dirname(__file__), "current_process_health.json")
|
||||
|
||||
|
||||
class ProcessHealthState:
|
||||
"""Track and persist process health state for monitoring integration"""
|
||||
|
||||
def __init__(self):
|
||||
self.event_id = None
|
||||
self.event_type = None
|
||||
self.process_name = None
|
||||
self.process_pid = None
|
||||
self.status = "stopped" # running, crashed, starting, stopped
|
||||
self.restart_count = 0
|
||||
self.max_restarts = 3
|
||||
self.last_update = datetime.now(timezone.utc).isoformat()
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
return {
|
||||
"event_id": self.event_id,
|
||||
"event_type": self.event_type,
|
||||
"current_process": self.process_name,
|
||||
"process_pid": self.process_pid,
|
||||
"process_status": self.status,
|
||||
"restart_count": self.restart_count,
|
||||
"timestamp": datetime.now(timezone.utc).isoformat()
|
||||
}
|
||||
|
||||
def save(self):
|
||||
"""Persist health state to file for simclient to read"""
|
||||
try:
|
||||
with open(HEALTH_STATE_FILE, "w") as f:
|
||||
json.dump(self.to_dict(), f, indent=2)
|
||||
except Exception as e:
|
||||
monitoring_logger.error(f"Failed to save health state: {e}")
|
||||
|
||||
def update_running(self, event_id: str, event_type: str, process_name: str, pid: Optional[int]):
|
||||
"""Mark process as running"""
|
||||
self.event_id = event_id
|
||||
self.event_type = event_type
|
||||
self.process_name = process_name
|
||||
self.process_pid = pid
|
||||
self.status = "running"
|
||||
self.restart_count = 0
|
||||
self.save()
|
||||
monitoring_logger.info(f"Process started: event_id={event_id} event_type={event_type} process={process_name} pid={pid}")
|
||||
|
||||
def update_crashed(self):
|
||||
"""Mark process as crashed"""
|
||||
self.status = "crashed"
|
||||
self.save()
|
||||
monitoring_logger.warning(f"Process crashed: event_id={self.event_id} event_type={self.event_type} process={self.process_name} restart_count={self.restart_count}/{self.max_restarts}")
|
||||
|
||||
def update_restart_attempt(self):
|
||||
"""Increment restart counter and check if max exceeded"""
|
||||
self.restart_count += 1
|
||||
self.save()
|
||||
if self.restart_count <= self.max_restarts:
|
||||
monitoring_logger.warning(f"Restarting process: attempt {self.restart_count}/{self.max_restarts} for {self.process_name}")
|
||||
else:
|
||||
monitoring_logger.error(f"Max restart attempts exceeded for {self.process_name}: event_id={self.event_id}")
|
||||
|
||||
def update_stopped(self):
|
||||
"""Mark process as stopped (normal event end)"""
|
||||
self.status = "stopped"
|
||||
self.event_id = None
|
||||
self.event_type = None
|
||||
self.process_name = None
|
||||
self.process_pid = None
|
||||
self.restart_count = 0
|
||||
self.save()
|
||||
monitoring_logger.info("Process stopped (event ended or no active event)")
|
||||
|
||||
|
||||
class HDMICECController:
|
||||
"""Controls HDMI-CEC to turn TV on/off automatically
|
||||
@@ -327,7 +415,7 @@ class DisplayProcess:
|
||||
self.player = player
|
||||
self.event_type = event_type
|
||||
self.event_id = event_id
|
||||
self.start_time = datetime.now()
|
||||
self.start_time = datetime.now(timezone.utc)
|
||||
self.log_file = log_file
|
||||
self.log_path = log_path
|
||||
|
||||
@@ -473,6 +561,9 @@ class DisplayManager:
|
||||
self.last_file_mtime: Optional[float] = None
|
||||
self.running = True
|
||||
|
||||
# Initialize health state tracking for process monitoring
|
||||
self.health = ProcessHealthState()
|
||||
|
||||
# Initialize HDMI-CEC controller
|
||||
self.cec = HDMICECController(
|
||||
enabled=CEC_ENABLED,
|
||||
@@ -629,7 +720,9 @@ class DisplayManager:
|
||||
time.sleep(1)
|
||||
if self.current_process.is_running():
|
||||
self.current_process.terminate(force=True)
|
||||
|
||||
|
||||
# Update health state to reflect process stopped
|
||||
self.health.update_stopped()
|
||||
self.current_process = None
|
||||
self.current_event_data = None
|
||||
|
||||
@@ -852,6 +945,9 @@ class DisplayManager:
|
||||
event_id = self.get_event_identifier(event)
|
||||
logging.info(f"Presentation started with PID: {process.pid}")
|
||||
|
||||
# Update health state for monitoring
|
||||
self.health.update_running(event_id, 'presentation', 'impressive', process.pid)
|
||||
|
||||
return DisplayProcess(process, 'presentation', event_id, log_file=impressive_log, log_path=impressive_log_path)
|
||||
|
||||
except Exception as e:
|
||||
@@ -904,6 +1000,8 @@ class DisplayManager:
|
||||
)
|
||||
event_id = self.get_event_identifier(event)
|
||||
logging.info(f"Video started with PID: {process.pid} (external vlc)")
|
||||
# Update health state for monitoring
|
||||
self.health.update_running(event_id, 'video', 'vlc', process.pid)
|
||||
return DisplayProcess(process=process, event_type='video', event_id=event_id, log_file=video_log, log_path=video_log_path)
|
||||
else:
|
||||
logging.error("No video player found (python-vlc or vlc binary)")
|
||||
@@ -911,7 +1009,24 @@ class DisplayManager:
|
||||
|
||||
# Use libvlc via python-vlc
|
||||
try:
|
||||
instance = vlc.Instance()
|
||||
# Keep python-vlc behavior aligned with external vlc fullscreen mode.
|
||||
instance = vlc.Instance('--fullscreen', '--no-video-title-show')
|
||||
|
||||
def _force_fullscreen(player_obj, label: str):
|
||||
"""Retry fullscreen toggle because video outputs may attach asynchronously."""
|
||||
if not player_obj:
|
||||
return
|
||||
|
||||
def _worker():
|
||||
for delay in (0.0, 0.4, 1.0):
|
||||
if delay > 0:
|
||||
time.sleep(delay)
|
||||
try:
|
||||
player_obj.set_fullscreen(True)
|
||||
except Exception as e:
|
||||
logging.debug(f"Could not force fullscreen for {label}: {e}")
|
||||
|
||||
threading.Thread(target=_worker, daemon=True).start()
|
||||
|
||||
autoplay = bool(video.get('autoplay', True))
|
||||
loop_flag = bool(video.get('loop', False))
|
||||
@@ -957,10 +1072,14 @@ class DisplayManager:
|
||||
if autoplay:
|
||||
try:
|
||||
mlp.play()
|
||||
_force_fullscreen(mp, 'python-vlc MediaListPlayer')
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to play media list: {e}")
|
||||
event_id = self.get_event_identifier(event)
|
||||
logging.info("Video started via python-vlc (MediaListPlayer)" )
|
||||
runtime_pid = os.getpid()
|
||||
logging.info(f"Video started via python-vlc (MediaListPlayer), runtime PID: {runtime_pid}")
|
||||
# python-vlc runs in-process (no external vlc child), so publish this process PID.
|
||||
self.health.update_running(event_id, 'video', 'vlc', runtime_pid)
|
||||
return DisplayProcess(process=None, event_type='video', event_id=event_id, log_file=None, log_path=None, player=mlp)
|
||||
|
||||
else:
|
||||
@@ -976,11 +1095,15 @@ class DisplayManager:
|
||||
if autoplay:
|
||||
try:
|
||||
mp.play()
|
||||
_force_fullscreen(mp, 'python-vlc MediaPlayer')
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to start media player: {e}")
|
||||
|
||||
event_id = self.get_event_identifier(event)
|
||||
logging.info("Video started via python-vlc (MediaPlayer)")
|
||||
runtime_pid = os.getpid()
|
||||
logging.info(f"Video started via python-vlc (MediaPlayer), runtime PID: {runtime_pid}")
|
||||
# python-vlc runs in-process (no external vlc child), so publish this process PID.
|
||||
self.health.update_running(event_id, 'video', 'vlc', runtime_pid)
|
||||
return DisplayProcess(process=None, event_type='video', event_id=event_id, log_file=None, log_path=None, player=mp)
|
||||
|
||||
except Exception as e:
|
||||
@@ -1137,6 +1260,10 @@ class DisplayManager:
|
||||
|
||||
event_id = self.get_event_identifier(event)
|
||||
logging.info(f"Webpage started with PID: {process.pid}")
|
||||
|
||||
# Update health state for monitoring (track chromium browser)
|
||||
browser_name = 'chromium-browser' if self._command_exists('chromium-browser') else 'chromium'
|
||||
self.health.update_running(event_id, etype, browser_name, process.pid)
|
||||
|
||||
# Inject auto-scroll JS via Chrome DevTools Protocol (CDP) if enabled and available
|
||||
if autoscroll_enabled:
|
||||
@@ -1367,9 +1494,20 @@ class DisplayManager:
|
||||
if self.current_process.event_type == 'presentation' and exit_code == 0:
|
||||
logging.info("Presentation process ended with exit code 0 (likely normal completion).")
|
||||
self.current_process = None
|
||||
# Update health state to show normal completion
|
||||
self.health.update_stopped()
|
||||
# Don't turn off TV yet - event might still be active
|
||||
return
|
||||
|
||||
# Process crashed unexpectedly
|
||||
self.health.update_crashed()
|
||||
self.health.update_restart_attempt()
|
||||
|
||||
if self.health.restart_count > self.health.max_restarts:
|
||||
logging.error(f"Max restart attempts exceeded - giving up on {self.health.process_name}")
|
||||
self.current_process = None
|
||||
return
|
||||
|
||||
logging.info("Restarting display process...")
|
||||
self.current_process = None
|
||||
# Don't turn off TV when restarting same event
|
||||
@@ -1413,10 +1551,8 @@ class DisplayManager:
|
||||
|
||||
# Log timezone information for debugging
|
||||
now_utc = datetime.now(timezone.utc)
|
||||
now_local = datetime.now()
|
||||
logging.info(f"Current time (UTC): {now_utc.strftime('%Y-%m-%d %H:%M:%S %Z')}")
|
||||
logging.info(f"Current time (Local): {now_local.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
logging.info("Note: Event times are expected in UTC format")
|
||||
logging.info("Event times are expected in UTC format")
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
@@ -1469,7 +1605,7 @@ class DisplayManager:
|
||||
Maintains a 'latest.jpg' file and rotates older screenshots beyond SCREENSHOT_MAX_FILES.
|
||||
"""
|
||||
try:
|
||||
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
ts = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
|
||||
raw_path = os.path.join(self.screenshot_dir, f'screenshot_{ts}.png')
|
||||
final_path = os.path.join(self.screenshot_dir, f'screenshot_{ts}.jpg')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user