From 3107d0f67169db32f9dcea6efbde25e73e2efdb3 Mon Sep 17 00:00:00 2001 From: olafn Date: Tue, 10 Mar 2026 07:33:38 +0000 Subject: [PATCH] feat(monitoring): add server-side client logging and health infrastructure - add Alembic migration c1d2e3f4g5h6 for client monitoring: - create client_logs table with FK to clients.uuid and performance indexes - extend clients with process/health tracking fields - extend data model with ClientLog, LogLevel, ProcessStatus, and ScreenHealthStatus - enhance listener MQTT handling: - subscribe to logs and health topics - persist client logs from infoscreen/{uuid}/logs/{level} - process health payloads and enrich heartbeat-derived client state - add monitoring API blueprint server/routes/client_logs.py: - GET /api/client-logs//logs - GET /api/client-logs/summary - GET /api/client-logs/recent-errors - GET /api/client-logs/test - register client_logs blueprint in server/wsgi.py - align compose/dev runtime for listener live-code execution - add client-side implementation docs: - CLIENT_MONITORING_SPECIFICATION.md - CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md - update TECH-CHANGELOG.md and copilot-instructions.md: - document monitoring changes - codify post-release technical-notes/no-version-bump convention --- .github/copilot-instructions.md | 28 +- CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md | 757 ++++++++++++++ CLIENT_MONITORING_SPECIFICATION.md | 972 ++++++++++++++++++ TECH-CHANGELOG.md | 45 + docker-compose.yml | 1 + listener/listener.py | 123 ++- models/models.py | 46 + .../c1d2e3f4g5h6_add_client_monitoring.py | 84 ++ server/routes/client_logs.py | 255 +++++ server/wsgi.py | 2 + 10 files changed, 2307 insertions(+), 6 deletions(-) create mode 100644 CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md create mode 100644 CLIENT_MONITORING_SPECIFICATION.md create mode 100644 server/alembic/versions/c1d2e3f4g5h6_add_client_monitoring.py create mode 100644 server/routes/client_logs.py diff --git a/.github/copilot-instructions.md 
b/.github/copilot-instructions.md index e479ba0..d8c898a 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -124,9 +124,11 @@ Keep docs synced with code. When you change services/MQTT/API/UTC/env or dev/pro - Scheduler queries a future window (default: 7 days) to expand recurring events using RFC 5545 rules, applies event exceptions (skipped dates, detached occurrences), and publishes only events that are active at the current time (UTC). When a group has no active events, the scheduler clears its retained topic by publishing an empty list. Time comparisons are UTC; naive timestamps are normalized. Logging is concise; conversion lookups are cached and logged only once per media. - MQTT topics (paho-mqtt v2, use Callback API v2): - Discovery: `infoscreen/discovery` (JSON includes `uuid`, hw/ip data). ACK to `infoscreen/{uuid}/discovery_ack`. See `listener/listener.py`. - - Heartbeat: `infoscreen/{uuid}/heartbeat` updates `Client.last_alive` (UTC). + - Heartbeat: `infoscreen/{uuid}/heartbeat` updates `Client.last_alive` (UTC); enhanced payload includes `current_process`, `process_pid`, `process_status`, `current_event_id`. - Event lists (retained): `infoscreen/events/{group_id}` from `scheduler/scheduler.py`. - Per-client group assignment (retained): `infoscreen/{uuid}/group_id` via `server/mqtt_helper.py`. + - Client logs: `infoscreen/{uuid}/logs/{error|warn|info}` with JSON payload (timestamp, message, context); QoS 1 for ERROR/WARN, QoS 0 for INFO. + - Client health: `infoscreen/{uuid}/health` with metrics (expected_state, actual_state, health_metrics); QoS 0, published every 5 seconds. - Screenshots: server-side folders `server/received_screenshots/` and `server/screenshots/`; Nginx exposes `/screenshots/{uuid}.jpg` via `server/wsgi.py` route. - Dev Container guidance: If extensions reappear inside the container, remove UI-only extensions from `devcontainer.json` `extensions` and map them in `remote.extensionKind` as `"ui"`. 
@@ -146,6 +148,11 @@ Keep docs synced with code. When you change services/MQTT/API/UTC/env or dev/pro - `locked_until`: TIMESTAMP placeholder for account lockout (infrastructure in place, not yet enforced) - `deactivated_at`, `deactivated_by`: Soft-delete audit trail (FK self-reference); soft deactivation is the default, hard delete superadmin-only - Role hierarchy (privilege escalation enforced): `user` < `editor` < `admin` < `superadmin` +- Client monitoring (migration: `c1d2e3f4g5h6_add_client_monitoring.py`): + - `ClientLog` model: Centralized log storage with fields (id, client_uuid, timestamp, level, message, context, created_at); FK to clients.uuid (CASCADE) + - `Client` model extended: 7 health monitoring fields (`current_event_id`, `current_process`, `process_status`, `process_pid`, `last_screenshot_analyzed`, `screen_health_status`, `last_screenshot_hash`) + - Enums: `LogLevel` (ERROR, WARN, INFO, DEBUG), `ProcessStatus` (running, crashed, starting, stopped), `ScreenHealthStatus` (OK, BLACK, FROZEN, UNKNOWN) + - Indexes: (client_uuid, timestamp DESC), (level, timestamp DESC), (created_at DESC) for performance - System settings: `system_settings` key–value store via `SystemSetting` for global configuration (e.g., WebUntis/Vertretungsplan supplement-table). Managed through routes in `server/routes/system_settings.py`. - Presentation defaults (system-wide): - `presentation_interval` (seconds, default "10") @@ -189,6 +196,11 @@ Keep docs synced with code. When you change services/MQTT/API/UTC/env or dev/pro - `PUT /api/users//password` — admin password reset (requires backend check to reject self-reset for consistency) - `DELETE /api/users/` — hard delete (superadmin only, with self-deletion check) - Auth routes (`server/routes/auth.py`): Enhanced to track login events (sets `last_login_at`, resets `failed_login_attempts` on success; increments `failed_login_attempts` and `last_failed_login_at` on failure). 
Self-service password change via `PUT /api/auth/change-password` requires current password verification. + - Client logs (`server/routes/client_logs.py`): Centralized log retrieval for monitoring: + - `GET /api/client-logs//logs` – Query client logs with filters (level, limit, since); admin_or_higher + - `GET /api/client-logs/summary` – Log counts by level per client (last 24h); admin_or_higher + - `GET /api/client-logs/recent-errors` – System-wide error monitoring; admin_or_higher + - `GET /api/client-logs/test` – Infrastructure validation (no auth); returns recent logs with counts Documentation maintenance: keep this file aligned with real patterns; update when routes/session/UTC rules change. Avoid long prose; link exact paths. @@ -364,7 +376,8 @@ Docs maintenance guardrails (solo-friendly): Update this file alongside code cha ## Quick examples - Add client description persists to DB and publishes group via MQTT: see `PUT /api/clients//description` in `routes/clients.py`. - Bulk group assignment emits retained messages for each client: `PUT /api/clients/group`. -- Listener heartbeat path: `infoscreen//heartbeat` → sets `clients.last_alive`. +- Listener heartbeat path: `infoscreen//heartbeat` → sets `clients.last_alive` and captures process health data. +- Client monitoring flow: Client publishes to `infoscreen/{uuid}/logs/error` → listener stores in `client_logs` table → API serves via `/api/client-logs//logs` → dashboard displays (Phase 4, pending). ## Scheduler payloads: presentation extras - Presentation event payloads now include `page_progress` and `auto_progress` in addition to `slide_interval` and media files. These are sourced from per-event fields in the database (with system defaults applied on event creation). @@ -393,3 +406,14 @@ Questions or unclear areas? 
Tell us if you need: exact devcontainer debugging st - Breaking changes must be prefixed with `BREAKING:` - Keep ≤ 8–10 bullets; summarize or group micro-changes - JSON hygiene: valid JSON, no trailing commas, don’t edit historical entries except typos + +## Versioning Convention (Tech vs UI) + +- Use one unified app version across technical and user-facing release notes. +- `dashboard/public/program-info.json` is user-facing and should list only user-visible changes. +- `TECH-CHANGELOG.md` can include deeper technical details for the same released version. +- If server/infrastructure work is implemented but not yet released or not user-visible, document it under the latest released section as: + - `Backend technical work (post-release notes; no version bump)` +- Do not create a new version header in `TECH-CHANGELOG.md` for internal milestones alone. +- Bump version numbers when a release is actually cut/deployed (or when user-facing release notes are published), not for intermediate backend-only steps. +- When UI integration lands later, include the user-visible part in the next release version and reference prior post-release technical groundwork when useful. 
diff --git a/CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md b/CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md new file mode 100644 index 0000000..db98366 --- /dev/null +++ b/CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md @@ -0,0 +1,757 @@ +# 🚀 Client Monitoring Implementation Guide + +**Phase-based implementation guide for basic monitoring in development phase** + +--- + +## ✅ Phase 1: Server-Side Database Foundation +**Status:** ✅ COMPLETE +**Dependencies:** None - Already implemented +**Time estimate:** Completed + +### ✅ Step 1.1: Database Migration +**File:** `server/alembic/versions/c1d2e3f4g5h6_add_client_monitoring.py` +**What it does:** +- Creates `client_logs` table for centralized logging +- Adds health monitoring columns to `clients` table +- Creates indexes for efficient querying + +**To apply:** +```bash +cd /workspace/server +alembic upgrade head +``` + +### ✅ Step 1.2: Update Data Models +**File:** `models/models.py` +**What was added:** +- New enums: `LogLevel`, `ProcessStatus`, `ScreenHealthStatus` +- Updated `Client` model with health tracking fields +- New `ClientLog` model for log storage + +--- + +## 🔧 Phase 2: Server-Side Backend Logic +**Status:** 🚧 IN PROGRESS +**Dependencies:** Phase 1 complete +**Time estimate:** 2-3 hours + +### Step 2.1: Extend MQTT Listener +**File:** `listener/listener.py` +**What to add:** + +```python +# Add new topic subscriptions in on_connect(): +client.subscribe("infoscreen/+/logs/error") +client.subscribe("infoscreen/+/logs/warn") +client.subscribe("infoscreen/+/logs/info") # Dev mode only +client.subscribe("infoscreen/+/health") + +# Add new handler in on_message(): +def handle_log_message(uuid, level, payload): + """Store client log in database""" + from models.models import ClientLog, LogLevel + from server.database import Session + import json + + session = Session() + try: + log_entry = ClientLog( + client_uuid=uuid, + timestamp=payload.get('timestamp', datetime.now(timezone.utc)), + level=LogLevel[level], + 
message=payload.get('message', ''), + context=json.dumps(payload.get('context', {})) + ) + session.add(log_entry) + session.commit() + print(f"[LOG] {uuid} {level}: {payload.get('message', '')}") + except Exception as e: + print(f"Error saving log: {e}") + session.rollback() + finally: + session.close() + +def handle_health_message(uuid, payload): + """Update client health status""" + from models.models import Client, ProcessStatus + from server.database import Session + + session = Session() + try: + client = session.query(Client).filter_by(uuid=uuid).first() + if client: + client.current_event_id = payload.get('expected_state', {}).get('event_id') + client.current_process = payload.get('actual_state', {}).get('process') + + status_str = payload.get('actual_state', {}).get('status') + if status_str: + client.process_status = ProcessStatus[status_str] + + client.process_pid = payload.get('actual_state', {}).get('pid') + session.commit() + except Exception as e: + print(f"Error updating health: {e}") + session.rollback() + finally: + session.close() +``` + +**Topic routing logic:** +```python +# In on_message callback, add routing: +if topic.endswith('/logs/error'): + handle_log_message(uuid, 'ERROR', payload) +elif topic.endswith('/logs/warn'): + handle_log_message(uuid, 'WARN', payload) +elif topic.endswith('/logs/info'): + handle_log_message(uuid, 'INFO', payload) +elif topic.endswith('/health'): + handle_health_message(uuid, payload) +``` + +### Step 2.2: Create API Routes +**File:** `server/routes/client_logs.py` (NEW) + +```python +from flask import Blueprint, jsonify, request +from server.database import Session +from server.permissions import admin_or_higher +from models.models import ClientLog, Client +from sqlalchemy import desc +import json + +client_logs_bp = Blueprint("client_logs", __name__, url_prefix="/api/client-logs") + +@client_logs_bp.route("//logs", methods=["GET"]) +@admin_or_higher +def get_client_logs(uuid): + """ + Get logs for a specific 
client + Query params: + - level: ERROR, WARN, INFO, DEBUG (optional) + - limit: number of entries (default 50, max 500) + - since: ISO timestamp (optional) + """ + session = Session() + try: + level = request.args.get('level') + limit = min(int(request.args.get('limit', 50)), 500) + since = request.args.get('since') + + query = session.query(ClientLog).filter_by(client_uuid=uuid) + + if level: + from models.models import LogLevel + query = query.filter_by(level=LogLevel[level]) + + if since: + from datetime import datetime + since_dt = datetime.fromisoformat(since.replace('Z', '+00:00')) + query = query.filter(ClientLog.timestamp >= since_dt) + + logs = query.order_by(desc(ClientLog.timestamp)).limit(limit).all() + + result = [] + for log in logs: + result.append({ + "id": log.id, + "timestamp": log.timestamp.isoformat() if log.timestamp else None, + "level": log.level.value if log.level else None, + "message": log.message, + "context": json.loads(log.context) if log.context else {} + }) + + session.close() + return jsonify({"logs": result, "count": len(result)}) + + except Exception as e: + session.close() + return jsonify({"error": str(e)}), 500 + +@client_logs_bp.route("/summary", methods=["GET"]) +@admin_or_higher +def get_logs_summary(): + """Get summary of errors/warnings across all clients""" + session = Session() + try: + from sqlalchemy import func + from models.models import LogLevel + from datetime import datetime, timedelta + + # Last 24 hours + since = datetime.utcnow() - timedelta(hours=24) + + stats = session.query( + ClientLog.client_uuid, + ClientLog.level, + func.count(ClientLog.id).label('count') + ).filter( + ClientLog.timestamp >= since + ).group_by( + ClientLog.client_uuid, + ClientLog.level + ).all() + + result = {} + for stat in stats: + uuid = stat.client_uuid + if uuid not in result: + result[uuid] = {"ERROR": 0, "WARN": 0, "INFO": 0} + result[uuid][stat.level.value] = stat.count + + session.close() + return jsonify({"summary": result, 
"period_hours": 24}) + + except Exception as e: + session.close() + return jsonify({"error": str(e)}), 500 +``` + +**Register in `server/wsgi.py`:** +```python +from server.routes.client_logs import client_logs_bp +app.register_blueprint(client_logs_bp) +``` + +### Step 2.3: Add Health Data to Heartbeat Handler +**File:** `listener/listener.py` (extend existing heartbeat handler) + +```python +# Modify existing heartbeat handler to capture health data +def on_message(client, userdata, message): + topic = message.topic + + # Existing heartbeat logic... + if '/heartbeat' in topic: + uuid = extract_uuid_from_topic(topic) + try: + payload = json.loads(message.payload.decode()) + + # Update last_alive (existing) + session = Session() + client_obj = session.query(Client).filter_by(uuid=uuid).first() + if client_obj: + client_obj.last_alive = datetime.now(timezone.utc) + + # NEW: Update health data if present in heartbeat + if 'process_status' in payload: + client_obj.process_status = ProcessStatus[payload['process_status']] + if 'current_process' in payload: + client_obj.current_process = payload['current_process'] + if 'process_pid' in payload: + client_obj.process_pid = payload['process_pid'] + if 'current_event_id' in payload: + client_obj.current_event_id = payload['current_event_id'] + + session.commit() + session.close() + except Exception as e: + print(f"Error processing heartbeat: {e}") +``` + +--- + +## 🖥️ Phase 3: Client-Side Implementation +**Status:** ⏳ PENDING (After Phase 2) +**Dependencies:** Phase 2 complete +**Time estimate:** 3-4 hours + +### Step 3.1: Create Client Watchdog Script +**File:** `client/watchdog.py` (NEW - on client device) + +```python +#!/usr/bin/env python3 +""" +Client-side process watchdog +Monitors VLC, Chromium, PDF viewer and reports health to server +""" +import psutil +import paho.mqtt.client as mqtt +import json +import time +from datetime import datetime, timezone +import sys +import os + +class MediaWatchdog: + def 
__init__(self, client_uuid, mqtt_broker, mqtt_port=1883): + self.uuid = client_uuid + self.mqtt_client = mqtt.Client() + self.mqtt_client.connect(mqtt_broker, mqtt_port, 60) + self.mqtt_client.loop_start() + + self.current_process = None + self.current_event_id = None + self.restart_attempts = 0 + self.MAX_RESTARTS = 3 + + def send_log(self, level, message, context=None): + """Send log message to server via MQTT""" + topic = f"infoscreen/{self.uuid}/logs/{level.lower()}" + payload = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "message": message, + "context": context or {} + } + self.mqtt_client.publish(topic, json.dumps(payload), qos=1) + print(f"[{level}] {message}") + + def send_health(self, process_name, pid, status, event_id=None): + """Send health status to server""" + topic = f"infoscreen/{self.uuid}/health" + payload = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "expected_state": { + "event_id": event_id + }, + "actual_state": { + "process": process_name, + "pid": pid, + "status": status # 'running', 'crashed', 'starting', 'stopped' + } + } + self.mqtt_client.publish(topic, json.dumps(payload), qos=1, retain=False) + + def is_process_running(self, process_name): + """Check if a process is running""" + for proc in psutil.process_iter(['name', 'pid']): + try: + if process_name.lower() in proc.info['name'].lower(): + return proc.info['pid'] + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return None + + def monitor_loop(self): + """Main monitoring loop""" + print(f"Watchdog started for client {self.uuid}") + self.send_log("INFO", "Watchdog service started", {"uuid": self.uuid}) + + while True: + try: + # Check expected process (would be set by main event handler) + if self.current_process: + pid = self.is_process_running(self.current_process) + + if pid: + # Process is running + self.send_health( + self.current_process, + pid, + "running", + self.current_event_id + ) + self.restart_attempts = 0 # Reset on success + 
else: + # Process crashed + self.send_log( + "ERROR", + f"Process {self.current_process} crashed or stopped", + { + "event_id": self.current_event_id, + "process": self.current_process, + "restart_attempt": self.restart_attempts + } + ) + + if self.restart_attempts < self.MAX_RESTARTS: + self.send_log("WARN", f"Attempting restart ({self.restart_attempts + 1}/{self.MAX_RESTARTS})") + self.restart_attempts += 1 + # TODO: Implement restart logic (call event handler) + else: + self.send_log("ERROR", "Max restart attempts exceeded", { + "event_id": self.current_event_id + }) + + time.sleep(5) # Check every 5 seconds + + except KeyboardInterrupt: + print("Watchdog stopped by user") + break + except Exception as e: + self.send_log("ERROR", f"Watchdog error: {str(e)}", { + "exception": str(e), + "traceback": str(sys.exc_info()) + }) + time.sleep(10) # Wait longer on error + +if __name__ == "__main__": + import sys + if len(sys.argv) < 3: + print("Usage: python watchdog.py ") + sys.exit(1) + + uuid = sys.argv[1] + broker = sys.argv[2] + + watchdog = MediaWatchdog(uuid, broker) + watchdog.monitor_loop() +``` + +### Step 3.2: Integrate with Existing Event Handler +**File:** `client/event_handler.py` (modify existing) + +```python +# When starting a new event, notify watchdog +def play_event(event_data): + event_type = event_data.get('event_type') + event_id = event_data.get('id') + + if event_type == 'video': + process_name = 'vlc' + # Start VLC... + elif event_type == 'website': + process_name = 'chromium' + # Start Chromium... + elif event_type == 'presentation': + process_name = 'pdf_viewer' # or your PDF tool + # Start PDF viewer... 
+ + # Notify watchdog about expected process + watchdog.current_process = process_name + watchdog.current_event_id = event_id + watchdog.restart_attempts = 0 +``` + +### Step 3.3: Enhanced Heartbeat Payload +**File:** `client/heartbeat.py` (modify existing) + +```python +# Modify existing heartbeat to include process status +def send_heartbeat(mqtt_client, uuid): + # Get current process status + current_process = None + process_pid = None + process_status = "stopped" + + # Check if expected process is running + if watchdog.current_process: + pid = watchdog.is_process_running(watchdog.current_process) + if pid: + current_process = watchdog.current_process + process_pid = pid + process_status = "running" + + payload = { + "uuid": uuid, + "timestamp": datetime.now(timezone.utc).isoformat(), + # Existing fields... + # NEW health fields: + "current_process": current_process, + "process_pid": process_pid, + "process_status": process_status, + "current_event_id": watchdog.current_event_id + } + + mqtt_client.publish(f"infoscreen/{uuid}/heartbeat", json.dumps(payload)) +``` + +--- + +## 🎨 Phase 4: Dashboard UI Integration +**Status:** ⏳ PENDING (After Phase 3) +**Dependencies:** Phases 2 & 3 complete +**Time estimate:** 2-3 hours + +### Step 4.1: Create Log Viewer Component +**File:** `dashboard/src/ClientLogs.tsx` (NEW) + +```typescript +import React from 'react'; +import { GridComponent, ColumnsDirective, ColumnDirective, Page, Inject } from '@syncfusion/ej2-react-grids'; + +interface LogEntry { + id: number; + timestamp: string; + level: 'ERROR' | 'WARN' | 'INFO' | 'DEBUG'; + message: string; + context: any; +} + +interface ClientLogsProps { + clientUuid: string; +} + +export const ClientLogs: React.FC = ({ clientUuid }) => { + const [logs, setLogs] = React.useState([]); + const [loading, setLoading] = React.useState(false); + + const loadLogs = async (level?: string) => { + setLoading(true); + try { + const params = new URLSearchParams({ limit: '50' }); + if (level) 
params.append('level', level); + + const response = await fetch(`/api/client-logs/${clientUuid}/logs?${params}`); + const data = await response.json(); + setLogs(data.logs); + } catch (err) { + console.error('Failed to load logs:', err); + } finally { + setLoading(false); + } + }; + + React.useEffect(() => { + loadLogs(); + const interval = setInterval(() => loadLogs(), 30000); // Refresh every 30s + return () => clearInterval(interval); + }, [clientUuid]); + + const levelTemplate = (props: any) => { + const colors = { + ERROR: 'text-red-600 bg-red-100', + WARN: 'text-yellow-600 bg-yellow-100', + INFO: 'text-blue-600 bg-blue-100', + DEBUG: 'text-gray-600 bg-gray-100' + }; + return ( + + {props.level} + + ); + }; + + return ( +
+    <div className="client-logs">
+      <GridComponent dataSource={logs} allowPaging={true} pageSettings={{ pageSize: 10 }}>
+        <ColumnsDirective>
+          <ColumnDirective field="timestamp" headerText="Timestamp" width="180" />
+          <ColumnDirective field="level" headerText="Level" width="100" template={levelTemplate} />
+          <ColumnDirective field="message" headerText="Message" />
+        </ColumnsDirective>
+        <Inject services={[Page]} />
+      </GridComponent>
+    </div>
+ ); +}; +``` + +### Step 4.2: Add Health Indicators to Client Cards +**File:** `dashboard/src/clients.tsx` (modify existing) + +```typescript +// Add health indicator to client card +const getHealthBadge = (client: Client) => { + if (!client.process_status) { + return Unknown; + } + + const badges = { + running: ✓ Running, + crashed: ✗ Crashed, + starting: ⟳ Starting, + stopped: ■ Stopped + }; + + return badges[client.process_status] || null; +}; + +// In client card render: +
+<div className="client-card">
+  <h3 className="font-semibold">{client.hostname || client.uuid}</h3>
+  <div>Status: {getHealthBadge(client)}</div>
+  <div>Process: {client.current_process || 'None'}</div>
+  <div>Event ID: {client.current_event_id || 'None'}</div>
+</div>
+``` + +### Step 4.3: Add System Health Dashboard (Superadmin) +**File:** `dashboard/src/SystemMonitor.tsx` (NEW) + +```typescript +import React from 'react'; +import { ClientLogs } from './ClientLogs'; + +export const SystemMonitor: React.FC = () => { + const [summary, setSummary] = React.useState({}); + + const loadSummary = async () => { + const response = await fetch('/api/client-logs/summary'); + const data = await response.json(); + setSummary(data.summary); + }; + + React.useEffect(() => { + loadSummary(); + const interval = setInterval(loadSummary, 30000); + return () => clearInterval(interval); + }, []); + + return ( +
+    <div className="system-monitor p-4">
+      <h1 className="text-xl font-bold">System Health Monitor (Superadmin)</h1>
+
+      <div className="active-issues">
+        <h2 className="font-semibold">Active Issues</h2>
+        {Object.entries(summary).map(([uuid, stats]: [string, any]) => (
+          stats.ERROR > 0 || stats.WARN > 5 ? (
+            <div key={uuid} className="text-red-600">
+              🔴 {uuid}: {stats.ERROR} errors, {stats.WARN} warnings (24h)
+            </div>
+          ) : null
+        ))}
+      </div>
+
+      {/* Real-time log stream */}
+      <div className="recent-logs">
+        <h2 className="font-semibold">Recent Logs (All Clients)</h2>
+        {/* Implement real-time log aggregation */}
+      </div>
+    </div>
+ ); +}; +``` + +--- + +## 🧪 Phase 5: Testing & Validation +**Status:** ⏳ PENDING +**Dependencies:** All previous phases +**Time estimate:** 1-2 hours + +### Step 5.1: Server-Side Tests + +```bash +# Test database migration +cd /workspace/server +alembic upgrade head +alembic downgrade -1 +alembic upgrade head + +# Test API endpoints +curl -X GET "http://localhost:8000/api/client-logs//logs?limit=10" +curl -X GET "http://localhost:8000/api/client-logs/summary" +``` + +### Step 5.2: Client-Side Tests + +```bash +# On client device +python3 watchdog.py + +# Simulate process crash +pkill vlc # Should trigger error log and restart attempt + +# Check MQTT messages +mosquitto_sub -h -t "infoscreen/+/logs/#" -v +mosquitto_sub -h -t "infoscreen/+/health" -v +``` + +### Step 5.3: Dashboard Tests + +1. Open dashboard and navigate to Clients page +2. Verify health indicators show correct status +3. Click "View Logs" and verify logs appear +4. Navigate to System Monitor (superadmin) +5. Verify summary statistics are correct + +--- + +## 📝 Configuration Summary + +### Environment Variables + +**Server (docker-compose.yml):** +```yaml +- LOG_RETENTION_DAYS=90 # How long to keep logs +- DEBUG_MODE=true # Enable INFO level logging via MQTT +``` + +**Client:** +```bash +export MQTT_BROKER="your-server-ip" +export CLIENT_UUID="abc-123-def" +export WATCHDOG_ENABLED=true +``` + +### MQTT Topics Reference + +| Topic Pattern | Direction | Purpose | +|--------------|-----------|---------| +| `infoscreen/{uuid}/logs/error` | Client → Server | Error messages | +| `infoscreen/{uuid}/logs/warn` | Client → Server | Warning messages | +| `infoscreen/{uuid}/logs/info` | Client → Server | Info (dev only) | +| `infoscreen/{uuid}/health` | Client → Server | Health metrics | +| `infoscreen/{uuid}/heartbeat` | Client → Server | Enhanced heartbeat | + +### Database Tables + +**client_logs:** +- Stores all centralized logs +- Indexed by client_uuid, timestamp, level +- Auto-cleanup after 90 days 
(recommended) + +**clients (extended):** +- `current_event_id`: Which event should be playing +- `current_process`: Expected process name +- `process_status`: running/crashed/starting/stopped +- `process_pid`: Process ID +- `screen_health_status`: OK/BLACK/FROZEN/UNKNOWN +- `last_screenshot_analyzed`: Last analysis time +- `last_screenshot_hash`: For frozen detection + +--- + +## 🎯 Next Steps After Implementation + +1. **Deploy Phase 1-2** to staging environment +2. **Test with 1-2 pilot clients** before full rollout +3. **Monitor traffic & performance** (should be minimal) +4. **Fine-tune log levels** based on actual noise +5. **Add alerting** (email/Slack when errors > threshold) +6. **Implement screenshot analysis** (Phase 2 enhancement) +7. **Add trending/analytics** (which clients are least reliable) + +--- + +## 🚨 Troubleshooting + +**Logs not appearing in database:** +- Check MQTT broker logs: `docker logs infoscreen-mqtt` +- Verify listener subscriptions: Check `listener/listener.py` logs +- Test MQTT manually: `mosquitto_pub -h broker -t "infoscreen/test/logs/error" -m '{"message":"test"}'` + +**High database growth:** +- Check log_retention cleanup cronjob +- Reduce INFO level logging frequency +- Add sampling (log every 10th occurrence instead of all) + +**Client watchdog not detecting crashes:** +- Verify psutil can see processes: `ps aux | grep vlc` +- Check permissions (may need sudo for some process checks) +- Increase monitor loop frequency for faster detection + +--- + +## ✅ Completion Checklist + +- [ ] Phase 1: Database migration applied +- [ ] Phase 2: Listener extended for log topics +- [ ] Phase 2: API endpoints created and tested +- [ ] Phase 3: Client watchdog implemented +- [ ] Phase 3: Enhanced heartbeat deployed +- [ ] Phase 4: Dashboard log viewer working +- [ ] Phase 4: Health indicators visible +- [ ] Phase 5: End-to-end testing complete +- [ ] Documentation updated with new features +- [ ] Production deployment plan created + +--- + 
+**Last Updated:** 2026-03-09 +**Author:** GitHub Copilot +**For:** Infoscreen 2025 Project diff --git a/CLIENT_MONITORING_SPECIFICATION.md b/CLIENT_MONITORING_SPECIFICATION.md new file mode 100644 index 0000000..dba65c3 --- /dev/null +++ b/CLIENT_MONITORING_SPECIFICATION.md @@ -0,0 +1,972 @@ +# Client-Side Monitoring Specification + +**Version:** 1.0 +**Date:** 2026-03-10 +**For:** Infoscreen Client Implementation +**Server Endpoint:** `192.168.43.201:8000` (or your production server) +**MQTT Broker:** `192.168.43.201:1883` (or your production MQTT broker) + +--- + +## 1. Overview + +Each infoscreen client must implement health monitoring and logging capabilities to report status to the central server via MQTT. + +### 1.1 Goals +- **Detect failures:** Process crashes, frozen screens, content mismatches +- **Provide visibility:** Real-time health status visible on server dashboard +- **Enable remote diagnosis:** Centralized log storage for debugging +- **Auto-recovery:** Attempt automatic restart on failure + +### 1.2 Architecture +``` +┌─────────────────────────────────────────┐ +│ Infoscreen Client │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ Media Player │ │ Watchdog │ │ +│ │ (VLC/Chrome) │◄───│ Monitor │ │ +│ └──────────────┘ └──────┬───────┘ │ +│ │ │ +│ ┌──────────────┐ │ │ +│ │ Event Mgr │ │ │ +│ │ (receives │ │ │ +│ │ schedule) │◄───────────┘ │ +│ └──────┬───────┘ │ +│ │ │ +│ ┌──────▼───────────────────────┐ │ +│ │ MQTT Client │ │ +│ │ - Heartbeat (every 60s) │ │ +│ │ - Logs (error/warn/info) │ │ +│ │ - Health metrics (every 5s) │ │ +│ └──────┬────────────────────────┘ │ +└─────────┼──────────────────────────────┘ + │ + │ MQTT over TCP + ▼ + ┌─────────────┐ + │ MQTT Broker │ + │ (server) │ + └─────────────┘ +``` + +--- + +## 2. 
MQTT Protocol Specification + +### 2.1 Connection Parameters +``` +Broker: 192.168.43.201 (or DNS hostname) +Port: 1883 (standard MQTT) +Protocol: MQTT v3.1.1 +Client ID: "infoscreen-{client_uuid}" +Clean Session: false (retain subscriptions) +Keep Alive: 60 seconds +Username/Password: (if configured on broker) +``` + +### 2.2 QoS Levels +- **Heartbeat:** QoS 0 (fire and forget, high frequency) +- **Logs (ERROR/WARN):** QoS 1 (at least once delivery, important) +- **Logs (INFO):** QoS 0 (optional, high volume) +- **Health metrics:** QoS 0 (frequent, latest value matters) + +--- + +## 3. Topic Structure & Payload Formats + +### 3.1 Log Messages + +#### Topic Pattern: +``` +infoscreen/{client_uuid}/logs/{level} +``` + +Where `{level}` is one of: `error`, `warn`, `info` + +#### Payload Format (JSON): +```json +{ + "timestamp": "2026-03-10T07:30:00Z", + "message": "Human-readable error description", + "context": { + "event_id": 42, + "process": "vlc", + "error_code": "NETWORK_TIMEOUT", + "additional_key": "any relevant data" + } +} +``` + +#### Field Specifications: +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `timestamp` | string (ISO 8601 UTC) | Yes | When the event occurred. 
Use `YYYY-MM-DDTHH:MM:SSZ` format | +| `message` | string | Yes | Human-readable description of the event (max 1000 chars) | +| `context` | object | No | Additional structured data (will be stored as JSON) | + +#### Example Topics: +``` +infoscreen/9b8d1856-ff34-4864-a726-12de072d0f77/logs/error +infoscreen/9b8d1856-ff34-4864-a726-12de072d0f77/logs/warn +infoscreen/9b8d1856-ff34-4864-a726-12de072d0f77/logs/info +``` + +#### When to Send Logs: + +**ERROR (Always send):** +- Process crashed (VLC/Chromium/PDF viewer terminated unexpectedly) +- Content failed to load (404, network timeout, corrupt file) +- Hardware failure detected (display off, audio device missing) +- Exception caught in main event loop +- Maximum restart attempts exceeded + +**WARN (Always send):** +- Process restarted automatically (after crash) +- High resource usage (CPU >80%, RAM >90%) +- Slow performance (frame drops, lag) +- Non-critical failures (screenshot capture failed, cache full) +- Fallback content displayed (primary source unavailable) + +**INFO (Send in development, optional in production):** +- Process started successfully +- Event transition (switched from video to presentation) +- Content loaded successfully +- Watchdog service started/stopped + +--- + +### 3.2 Health Metrics + +#### Topic Pattern: +``` +infoscreen/{client_uuid}/health +``` + +#### Payload Format (JSON): +```json +{ + "timestamp": "2026-03-10T07:30:00Z", + "expected_state": { + "event_id": 42, + "event_type": "video", + "media_file": "presentation.mp4", + "started_at": "2026-03-10T07:15:00Z" + }, + "actual_state": { + "process": "vlc", + "pid": 1234, + "status": "running", + "uptime_seconds": 900, + "position": 45.3, + "duration": 180.0 + }, + "health_metrics": { + "screen_on": true, + "last_frame_update": "2026-03-10T07:29:58Z", + "frames_dropped": 2, + "network_errors": 0, + "cpu_percent": 15.3, + "memory_mb": 234 + } +} +``` + +#### Field Specifications: + +**expected_state:** +| Field | Type | Required | 
Description | +|-------|------|----------|-------------| +| `event_id` | integer | Yes | Current event ID from scheduler | +| `event_type` | string | Yes | `presentation`, `video`, `website`, `webuntis`, `message` | +| `media_file` | string | No | Filename or URL of current content | +| `started_at` | string (ISO 8601) | Yes | When this event started playing | + +**actual_state:** +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `process` | string | Yes | `vlc`, `chromium`, `pdf_viewer`, `none` | +| `pid` | integer | No | Process ID (if running) | +| `status` | string | Yes | `running`, `crashed`, `starting`, `stopped` | +| `uptime_seconds` | integer | No | How long process has been running | +| `position` | float | No | Current playback position (seconds, for video/audio) | +| `duration` | float | No | Total content duration (seconds) | + +**health_metrics:** +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `screen_on` | boolean | Yes | Is display powered on? | +| `last_frame_update` | string (ISO 8601) | No | Last time screen content changed | +| `frames_dropped` | integer | No | Video frames dropped (performance indicator) | +| `network_errors` | integer | No | Count of network errors in last interval | +| `cpu_percent` | float | No | CPU usage (0-100) | +| `memory_mb` | integer | No | RAM usage in megabytes | + +#### Sending Frequency: +- **Normal operation:** Every 5 seconds +- **During startup/transition:** Every 1 second +- **After error:** Immediately + every 2 seconds until recovered + +--- + +### 3.3 Enhanced Heartbeat + +The existing heartbeat topic should be enhanced to include process status. 
+ +#### Topic Pattern: +``` +infoscreen/{client_uuid}/heartbeat +``` + +#### Enhanced Payload Format (JSON): +```json +{ + "uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "timestamp": "2026-03-10T07:30:00Z", + "current_process": "vlc", + "process_pid": 1234, + "process_status": "running", + "current_event_id": 42 +} +``` + +#### New Fields (add to existing heartbeat): +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `current_process` | string | No | Name of active media player process | +| `process_pid` | integer | No | Process ID | +| `process_status` | string | No | `running`, `crashed`, `starting`, `stopped` | +| `current_event_id` | integer | No | Event ID currently being displayed | + +#### Sending Frequency: +- Keep existing: **Every 60 seconds** +- Include new fields if available + +--- + +## 4. Process Monitoring Requirements + +### 4.1 Processes to Monitor + +| Media Type | Process Name | How to Detect | +|------------|--------------|---------------| +| Video | `vlc` | `ps aux \| grep vlc` or `pgrep vlc` | +| Website/WebUntis | `chromium` or `chromium-browser` | `pgrep chromium` | +| PDF Presentation | `evince`, `okular`, or custom viewer | `pgrep {viewer_name}` | + +### 4.2 Monitoring Checks (Every 5 seconds) + +#### Check 1: Process Alive +``` +Goal: Verify expected process is running +Method: + - Get list of running processes (psutil or `ps`) + - Check if expected process name exists + - Match PID if known +Result: + - If missing → status = "crashed" + - If found → status = "running" +Action on crash: + - Send ERROR log immediately + - Attempt restart (max 3 attempts) + - Send WARN log on each restart + - If max restarts exceeded → send ERROR log, display fallback +``` + +#### Check 2: Process Responsive +``` +Goal: Detect frozen processes +Method: + - For VLC: Query HTTP interface (status.json) + - For Chromium: Use DevTools Protocol (CDP) + - For custom viewers: Check last screen update time +Result: + - If 
same frame >30 seconds → likely frozen + - If playback position not advancing → frozen +Action on freeze: + - Send WARN log + - Force refresh (reload page, seek video, next slide) + - If refresh fails → restart process +``` + +#### Check 3: Content Match +``` +Goal: Verify correct content is displayed +Method: + - Compare expected event_id with actual media/URL + - Check scheduled time window (is event still active?) +Result: + - Mismatch → content error +Action: + - Send WARN log + - Reload correct event from scheduler +``` + +--- + +## 5. Process Control Interface Requirements + +### 5.1 VLC Control + +**Requirement:** Enable VLC HTTP interface for monitoring + +**Launch Command:** +```bash +vlc --intf http --http-host 127.0.0.1 --http-port 8080 --http-password "vlc_password" \ + --fullscreen --loop /path/to/video.mp4 +``` + +**Status Query:** +```bash +curl http://127.0.0.1:8080/requests/status.json --user ":vlc_password" +``` + +**Response Fields to Monitor:** +```json +{ + "state": "playing", // "playing", "paused", "stopped" + "position": 0.25, // 0.0-1.0 (25% through) + "time": 45, // seconds into playback + "length": 180, // total duration in seconds + "volume": 256 // 0-512 +} +``` + +--- + +### 5.2 Chromium Control + +**Requirement:** Enable Chrome DevTools Protocol (CDP) + +**Launch Command:** +```bash +chromium --remote-debugging-port=9222 --kiosk --app=https://example.com +``` + +**Status Query:** +```bash +curl http://127.0.0.1:9222/json +``` + +**Response Fields to Monitor:** +```json +[ + { + "url": "https://example.com", + "title": "Page Title", + "type": "page" + } +] +``` + +**Advanced:** Use CDP WebSocket for events (page load, navigation, errors) + +--- + +### 5.3 PDF Viewer (Custom or Standard) + +**Option A: Standard Viewer (e.g., Evince)** +- No built-in API +- Monitor via process check + screenshot comparison + +**Option B: Custom Python Viewer** +- Implement REST API for status queries +- Track: current page, total pages, last transition 
time + +--- + +## 6. Watchdog Service Architecture + +### 6.1 Service Components + +**Component 1: Process Monitor Thread** +``` +Responsibilities: + - Check process alive every 5 seconds + - Detect crashes and frozen processes + - Attempt automatic restart + - Send health metrics via MQTT + +State Machine: + IDLE → STARTING → RUNNING → (if crash) → RESTARTING → RUNNING + → (if max restarts) → FAILED +``` + +**Component 2: MQTT Publisher Thread** +``` +Responsibilities: + - Maintain MQTT connection + - Send heartbeat every 60 seconds + - Send logs on-demand (queued from other components) + - Send health metrics every 5 seconds + - Reconnect on connection loss +``` + +**Component 3: Event Manager Integration** +``` +Responsibilities: + - Receive event schedule from server + - Notify watchdog of expected process/content + - Launch media player processes + - Handle event transitions +``` + +### 6.2 Service Lifecycle + +**On Startup:** +1. Load configuration (client UUID, MQTT broker, etc.) +2. Connect to MQTT broker +3. Send INFO log: "Watchdog service started" +4. Wait for first event from scheduler + +**During Operation:** +1. Monitor loop runs every 5 seconds +2. Check expected vs actual process state +3. Send health metrics +4. Handle failures (log + restart) + +**On Shutdown:** +1. Send INFO log: "Watchdog service stopping" +2. Gracefully stop monitored processes +3. Disconnect from MQTT +4. Exit cleanly + +--- + +## 7. 
Auto-Recovery Logic + +### 7.1 Restart Strategy + +**Step 1: Detect Failure** +``` +Trigger: Process not found in process list +Action: + - Log ERROR: "Process {name} crashed" + - Increment restart counter + - Check if within retry limit (max 3) +``` + +**Step 2: Attempt Restart** +``` +If restart_attempts < MAX_RESTARTS: + - Log WARN: "Attempting restart ({attempt}/{MAX_RESTARTS})" + - Kill any zombie processes + - Wait 2 seconds (cooldown) + - Launch process with same parameters + - Wait 5 seconds for startup + - Verify process is running + - If success: reset restart counter, log INFO + - If fail: increment counter, repeat +``` + +**Step 3: Permanent Failure** +``` +If restart_attempts >= MAX_RESTARTS: + - Log ERROR: "Max restart attempts exceeded, failing over" + - Display fallback content (static image with error message) + - Send notification to server (separate alert topic, optional) + - Wait for manual intervention or scheduler event change +``` + +### 7.2 Restart Cooldown + +**Purpose:** Prevent rapid restart loops that waste resources + +**Implementation:** +``` +After each restart attempt: + - Wait 2 seconds before next restart + - After 3 failures: wait 30 seconds before trying again + - Reset counter on successful run >5 minutes +``` + +--- + +## 8. Resource Monitoring + +### 8.1 System Metrics to Track + +**CPU Usage:** +``` +Method: Read /proc/stat or use psutil.cpu_percent() +Frequency: Every 5 seconds +Threshold: Warn if >80% for >60 seconds +``` + +**Memory Usage:** +``` +Method: Read /proc/meminfo or use psutil.virtual_memory() +Frequency: Every 5 seconds +Threshold: Warn if >90% for >30 seconds +``` + +**Display Status:** +``` +Method: Check DPMS state or xset query +Frequency: Every 30 seconds +Threshold: Error if display off (unexpected) +``` + +**Network Connectivity:** +``` +Method: Ping server or check MQTT connection +Frequency: Every 60 seconds +Threshold: Warn if no server connectivity +``` + +--- + +## 9. 
Development vs Production Mode + +### 9.1 Development Mode + +**Enable via:** Environment variable `DEBUG=true` or `ENV=development` + +**Behavior:** +- Send INFO level logs +- More verbose logging to console +- Shorter monitoring intervals (faster feedback) +- Screenshot capture every 30 seconds +- No rate limiting on logs + +### 9.2 Production Mode + +**Enable via:** `ENV=production` + +**Behavior:** +- Send only ERROR and WARN logs +- Minimal console output +- Standard monitoring intervals +- Screenshot capture every 60 seconds +- Rate limiting: max 10 logs per minute per level + +--- + +## 10. Configuration File Format + +### 10.1 Recommended Config: JSON + +**File:** `/etc/infoscreen/config.json` or `~/.config/infoscreen/config.json` + +```json +{ + "client": { + "uuid": "9b8d1856-ff34-4864-a726-12de072d0f77", + "hostname": "infoscreen-room-101" + }, + "mqtt": { + "broker": "192.168.43.201", + "port": 1883, + "username": "", + "password": "", + "keepalive": 60 + }, + "monitoring": { + "enabled": true, + "health_interval_seconds": 5, + "heartbeat_interval_seconds": 60, + "max_restart_attempts": 3, + "restart_cooldown_seconds": 2 + }, + "logging": { + "level": "INFO", + "send_info_logs": false, + "console_output": true, + "local_log_file": "/var/log/infoscreen/watchdog.log" + }, + "processes": { + "vlc": { + "http_port": 8080, + "http_password": "vlc_password" + }, + "chromium": { + "debug_port": 9222 + } + } +} +``` + +--- + +## 11. Error Scenarios & Expected Behavior + +### Scenario 1: VLC Crashes Mid-Video +``` +1. Watchdog detects: process_status = "crashed" +2. Send ERROR log: "VLC process crashed" +3. Attempt 1: Restart VLC with same video, seek to last position +4. If success: Send INFO log "VLC restarted successfully" +5. If fail: Repeat 2 more times +6. After 3 failures: Send ERROR "Max restarts exceeded", show fallback +``` + +### Scenario 2: Network Timeout Loading Website +``` +1. Chromium fails to load page (CDP reports error) +2. 
Send WARN log: "Page load timeout" +3. Attempt reload (Chromium refresh) +4. If success after 10s: Continue monitoring +5. If timeout again: Send ERROR, try restarting Chromium +``` + +### Scenario 3: Display Powers Off (Hardware) +``` +1. DPMS check detects display off +2. Send ERROR log: "Display powered off" +3. Attempt to wake display (xset dpms force on) +4. If success: Send INFO log +5. If fail: Hardware issue, alert admin +``` + +### Scenario 4: High CPU Usage +``` +1. CPU >80% for 60 seconds +2. Send WARN log: "High CPU usage: 85%" +3. Check if expected (e.g., video playback is normal) +4. If unexpected: investigate process causing it +5. If critical (>95%): consider restarting offending process +``` + +--- + +## 12. Testing & Validation + +### 12.1 Manual Tests (During Development) + +**Test 1: Process Crash Simulation** +```bash +# Start video, then kill VLC manually +killall vlc +# Expected: ERROR log sent, automatic restart within 5 seconds +``` + +**Test 2: MQTT Connectivity** +```bash +# Subscribe to all client topics on server +mosquitto_sub -h 192.168.43.201 -t "infoscreen/{uuid}/#" -v +# Expected: See heartbeat every 60s, health every 5s +``` + +**Test 3: Log Levels** +```bash +# Trigger error condition and verify log appears in database +curl http://192.168.43.201:8000/api/client-logs/test +# Expected: See new log entry with correct level/message +``` + +### 12.2 Acceptance Criteria + +✅ **Client must:** +1. Send heartbeat every 60 seconds without gaps +2. Send ERROR log within 5 seconds of process crash +3. Attempt automatic restart (max 3 times) +4. Report health metrics every 5 seconds +5. Survive MQTT broker restart (reconnect automatically) +6. Survive network interruption (buffer logs, send when reconnected) +7. Use correct timestamp format (ISO 8601 UTC) +8. Only send logs for real client UUID (FK constraint) + +--- + +## 13. 
Python Libraries (Recommended) + +**For process monitoring:** +- `psutil` - Cross-platform process and system utilities + +**For MQTT:** +- `paho-mqtt` - Official MQTT client (use v2.x with Callback API v2) + +**For VLC control:** +- `requests` - HTTP client for status queries + +**For Chromium control:** +- `websocket-client` or `pychrome` - Chrome DevTools Protocol + +**For datetime:** +- `datetime` (stdlib) - Use `datetime.now(timezone.utc).isoformat()` + +**Example requirements.txt:** +``` +paho-mqtt>=2.0.0 +psutil>=5.9.0 +requests>=2.31.0 +python-dateutil>=2.8.0 +``` + +--- + +## 14. Security Considerations + +### 14.1 MQTT Security +- If broker requires auth, store credentials in config file with restricted permissions (`chmod 600`) +- Consider TLS/SSL for MQTT (port 8883) if on untrusted network +- Use unique client ID to prevent impersonation + +### 14.2 Process Control APIs +- VLC HTTP password should be random, not default +- Chromium debug port should bind to `127.0.0.1` only (not `0.0.0.0`) +- Restrict file system access for media player processes + +### 14.3 Log Content +- **Do not log:** Passwords, API keys, personal data +- **Sanitize:** File paths (strip user directories), URLs (remove query params with tokens) + +--- + +## 15. Performance Targets + +| Metric | Target | Acceptable | Critical | +|--------|--------|------------|----------| +| Health check interval | 5s | 10s | 30s | +| Crash detection time | <5s | <10s | <30s | +| Restart time | <10s | <20s | <60s | +| MQTT publish latency | <100ms | <500ms | <2s | +| CPU usage (watchdog) | <2% | <5% | <10% | +| RAM usage (watchdog) | <50MB | <100MB | <200MB | +| Log message size | <1KB | <10KB | <100KB | + +--- + +## 16. Troubleshooting Guide (For Client Development) + +### Issue: Logs not appearing in server database +**Check:** +1. Is MQTT broker reachable? (`mosquitto_pub` test from client) +2. Is client UUID correct and exists in `clients` table? +3. 
Is timestamp format correct (ISO 8601 with 'Z')? +4. Check server listener logs for errors + +### Issue: Health metrics not updating +**Check:** +1. Is health loop running? (check watchdog service status) +2. Is MQTT connected? (check connection status in logs) +3. Is payload JSON valid? (use JSON validator) + +### Issue: Process restarts in loop +**Check:** +1. Is media file/URL accessible? +2. Is process command correct? (test manually) +3. Check process exit code (crash reason) +4. Increase restart cooldown to avoid rapid loops + +--- + +## 17. Complete Message Flow Diagram + +``` +┌─────────────────────────────────────────────────────────┐ +│ Infoscreen Client │ +│ │ +│ Event Occurs: │ +│ - Process crashed │ +│ - High CPU usage │ +│ - Content loaded │ +│ │ +│ ┌────────────────┐ │ +│ │ Decision Logic │ │ +│ │ - Is it ERROR?│ │ +│ │ - Is it WARN? │ │ +│ │ - Is it INFO? │ │ +│ └────────┬───────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────┐ │ +│ │ Build JSON Payload │ │ +│ │ { │ │ +│ │ "timestamp": "...", │ │ +│ │ "message": "...", │ │ +│ │ "context": {...} │ │ +│ │ } │ │ +│ └────────┬───────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌────────────────────────────────┐ │ +│ │ MQTT Publish │ │ +│ │ Topic: infoscreen/{uuid}/logs/error │ +│ │ QoS: 1 │ │ +│ └────────┬───────────────────────┘ │ +└───────────┼──────────────────────────────────────────┘ + │ + │ TCP/IP (MQTT Protocol) + │ + ▼ + ┌──────────────┐ + │ MQTT Broker │ + │ (Mosquitto) │ + └──────┬───────┘ + │ + │ Topic: infoscreen/+/logs/# + │ + ▼ + ┌──────────────────────────────┐ + │ Listener Service │ + │ (Python) │ + │ │ + │ - Parse JSON │ + │ - Validate UUID │ + │ - Store in database │ + └──────┬───────────────────────┘ + │ + ▼ + ┌──────────────────────────────┐ + │ MariaDB Database │ + │ │ + │ Table: client_logs │ + │ - client_uuid │ + │ - timestamp │ + │ - level │ + │ - message │ + │ - context (JSON) │ + └──────┬───────────────────────┘ + │ + │ SQL Query + │ + ▼ + ┌──────────────────────────────┐ + │ API 
Server (Flask) │ + │ │ + │ GET /api/client-logs/{uuid}/logs + │ GET /api/client-logs/summary + └──────┬───────────────────────┘ + │ + │ HTTP/JSON + │ + ▼ + ┌──────────────────────────────┐ + │ Dashboard (React) │ + │ │ + │ - Display logs │ + │ - Filter by level │ + │ - Show health status │ + └───────────────────────────────┘ +``` + +--- + +## 18. Quick Reference Card + +### MQTT Topics Summary +``` +infoscreen/{uuid}/logs/error → Critical failures +infoscreen/{uuid}/logs/warn → Non-critical issues +infoscreen/{uuid}/logs/info → Informational (dev mode) +infoscreen/{uuid}/health → Health metrics (every 5s) +infoscreen/{uuid}/heartbeat → Enhanced heartbeat (every 60s) +``` + +### JSON Timestamp Format +```python +from datetime import datetime, timezone +timestamp = datetime.now(timezone.utc).isoformat() +# Output: "2026-03-10T07:30:00+00:00" or "2026-03-10T07:30:00Z" +``` + +### Process Status Values +``` +"running" - Process is alive and responding +"crashed" - Process terminated unexpectedly +"starting" - Process is launching (startup phase) +"stopped" - Process intentionally stopped +``` + +### Restart Logic +``` +Max attempts: 3 +Cooldown: 2 seconds between attempts +Reset: After 5 minutes of successful operation +``` + +--- + +## 19. 
Contact & Support + +**Server API Documentation:** +- Base URL: `http://192.168.43.201:8000` +- Health check: `GET /health` +- Test logs: `GET /api/client-logs/test` (no auth) +- Full API docs: See `CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md` on server + +**MQTT Broker:** +- Host: `192.168.43.201` +- Port: `1883` (standard), `9001` (WebSocket) +- Test tool: `mosquitto_pub` / `mosquitto_sub` + +**Database Schema:** +- Table: `client_logs` +- Foreign Key: `client_uuid` → `clients.uuid` (ON DELETE CASCADE) +- Constraint: UUID must exist in clients table before logging + +**Server-Side Logs:** +```bash +# View listener logs (processes MQTT messages) +docker compose logs -f listener + +# View server logs (API requests) +docker compose logs -f server +``` + +--- + +## 20. Appendix: Example Implementations + +### A. Minimal Python Watchdog (Pseudocode) + +```python +import time +import json +import psutil +import paho.mqtt.client as mqtt +from datetime import datetime, timezone + +class MinimalWatchdog: + def __init__(self, client_uuid, mqtt_broker): + self.uuid = client_uuid + self.mqtt_client = mqtt.Client(callback_api_version=mqtt.CallbackAPIVersion.VERSION2) + self.mqtt_client.connect(mqtt_broker, 1883, 60) + self.mqtt_client.loop_start() + + self.expected_process = None + self.restart_attempts = 0 + self.MAX_RESTARTS = 3 + + def send_log(self, level, message, context=None): + topic = f"infoscreen/{self.uuid}/logs/{level}" + payload = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "message": message, + "context": context or {} + } + self.mqtt_client.publish(topic, json.dumps(payload), qos=1) + + def is_process_running(self, process_name): + for proc in psutil.process_iter(['name']): + if process_name in proc.info['name']: + return True + return False + + def monitor_loop(self): + while True: + if self.expected_process: + if not self.is_process_running(self.expected_process): + self.send_log("error", f"{self.expected_process} crashed") + if 
self.restart_attempts < self.MAX_RESTARTS: + self.restart_process() + else: + self.send_log("error", "Max restarts exceeded") + + time.sleep(5) + +# Usage: +watchdog = MinimalWatchdog("9b8d1856-ff34-4864-a726-12de072d0f77", "192.168.43.201") +watchdog.expected_process = "vlc" +watchdog.monitor_loop() +``` + +--- + +**END OF SPECIFICATION** + +Questions? Refer to: +- `CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md` (server repo) +- Server API: `http://192.168.43.201:8000/api/client-logs/test` +- MQTT test: `mosquitto_sub -h 192.168.43.201 -t infoscreen/#` diff --git a/TECH-CHANGELOG.md b/TECH-CHANGELOG.md index 577f99d..ea13eed 100644 --- a/TECH-CHANGELOG.md +++ b/TECH-CHANGELOG.md @@ -56,6 +56,51 @@ Notes for integrators: - CSS follows modern Material 3 color-function notation (`rgb(r g b / alpha%)`) - Syncfusion ScheduleComponent requires TimelineViews, Resize, and DragAndDrop modules injected +Backend technical work (post-release notes; no version bump): +- 📊 **Client Monitoring Infrastructure (Server-Side) (2026-03-10)**: + - Database schema: New Alembic migration `c1d2e3f4g5h6_add_client_monitoring.py` (idempotent) adds: + - `client_logs` table: Stores centralized logs with columns (id, client_uuid, timestamp, level, message, context, created_at) + - Foreign key: `client_logs.client_uuid` → `clients.uuid` (ON DELETE CASCADE) + - Health monitoring columns added to `clients` table: `current_event_id`, `current_process`, `process_status`, `process_pid`, `last_screenshot_analyzed`, `screen_health_status`, `last_screenshot_hash` + - Indexes for performance: (client_uuid, timestamp DESC), (level, timestamp DESC), (created_at DESC) + - Data models (`models/models.py`): + - New enums: `LogLevel` (ERROR, WARN, INFO, DEBUG), `ProcessStatus` (running, crashed, starting, stopped), `ScreenHealthStatus` (OK, BLACK, FROZEN, UNKNOWN) + - New model: `ClientLog` with foreign key to `Client` (CASCADE on delete) + - Extended `Client` model with 7 health monitoring fields + - MQTT 
listener extensions (`listener/listener.py`): + - New topic subscriptions: `infoscreen/+/logs/error`, `infoscreen/+/logs/warn`, `infoscreen/+/logs/info`, `infoscreen/+/health` + - Log handler: Parses JSON payloads, creates `ClientLog` entries, validates client UUID exists (FK constraint) + - Health handler: Updates client state from MQTT health messages + - Enhanced heartbeat handler: Captures `process_status`, `current_process`, `process_pid`, `current_event_id` from payload + - API endpoints (`server/routes/client_logs.py`): + - `GET /api/client-logs//logs` – Retrieve client logs with filters (level, limit, since); authenticated (admin_or_higher) + - `GET /api/client-logs/summary` – Get log counts by level per client for last 24h; authenticated (admin_or_higher) + - `GET /api/client-logs/recent-errors` – System-wide error monitoring; authenticated (admin_or_higher) + - `GET /api/client-logs/test` – Infrastructure validation endpoint (no auth required) + - Blueprint registered in `server/wsgi.py` as `client_logs_bp` + - Dev environment fix: Updated `docker-compose.override.yml` listener service to use `working_dir: /workspace` and direct command path for live code reload +- 📡 **MQTT Protocol Extensions**: + - New log topics: `infoscreen/{uuid}/logs/{error|warn|info}` with JSON payload (timestamp, message, context) + - New health topic: `infoscreen/{uuid}/health` with metrics (expected_state, actual_state, health_metrics) + - Enhanced heartbeat: `infoscreen/{uuid}/heartbeat` now includes `current_process`, `process_pid`, `process_status`, `current_event_id` + - QoS levels: ERROR/WARN logs use QoS 1 (at least once), INFO/health use QoS 0 (fire and forget) +- 📖 **Documentation**: + - New file: `CLIENT_MONITORING_SPECIFICATION.md` – Comprehensive 20-section technical spec for client-side implementation (MQTT protocol, process monitoring, auto-recovery, payload formats, testing guide) + - New file: `CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md` – 5-phase implementation 
guide (database, backend, client watchdog, dashboard UI, testing) + - Updated `.github/copilot-instructions.md`: Added MQTT topics section, client monitoring integration notes +- ✅ **Validation**: + - End-to-end testing completed: MQTT message → listener → database → API confirmed working + - Test flow: Published message to `infoscreen/{real-uuid}/logs/error` → listener logs showed receipt → database stored entry → test API returned log data + - Known client UUIDs validated: 9b8d1856-ff34-4864-a726-12de072d0f77, 7f65c615-5827-4ada-9ac8-4727c2e8ee55, bdbfff95-0b2b-4265-8cc7-b0284509540a + +Notes for integrators: +- Tiered logging strategy: ERROR/WARN always centralized (QoS 1), INFO dev-only (QoS 0), DEBUG local-only +- Client-side implementation pending (Phase 3: watchdog service) +- Dashboard UI pending (Phase 4: log viewer and health indicators) +- Foreign key constraint prevents logging for non-existent clients (data integrity enforced) +- Migration is idempotent and can be safely rerun after interruption +- Use `GET /api/client-logs/test` for quick infrastructure validation without authentication + ## 2025.1.0-beta.1 (TBD) - 🔐 **User Management & Role-Based Access Control**: - Backend: Implemented comprehensive user management API (`server/routes/users.py`) with 6 endpoints (GET, POST, PUT, DELETE users + password reset). 
diff --git a/docker-compose.yml b/docker-compose.yml index 615e91b..851401a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,6 +18,7 @@ services: environment: - DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME} - DB_URL=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME} + - API_BASE_URL=http://server:8000 - ENV=${ENV:-development} - FLASK_SECRET_KEY=${FLASK_SECRET_KEY:-dev-secret-key-change-in-production} - DEFAULT_SUPERADMIN_USERNAME=${DEFAULT_SUPERADMIN_USERNAME:-superadmin} diff --git a/listener/listener.py b/listener/listener.py index a152e47..b76eb9c 100644 --- a/listener/listener.py +++ b/listener/listener.py @@ -7,7 +7,7 @@ import requests import paho.mqtt.client as mqtt from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker -from models.models import Client +from models.models import Client, ClientLog, LogLevel, ProcessStatus logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s') # Load .env in development @@ -78,7 +78,14 @@ def on_connect(client, userdata, flags, reasonCode, properties): client.subscribe("infoscreen/+/heartbeat") client.subscribe("infoscreen/+/screenshot") client.subscribe("infoscreen/+/dashboard") - logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, and dashboards") + + # Subscribe to monitoring topics + client.subscribe("infoscreen/+/logs/error") + client.subscribe("infoscreen/+/logs/warn") + client.subscribe("infoscreen/+/logs/info") + client.subscribe("infoscreen/+/health") + + logging.info(f"MQTT connected (reasonCode: {reasonCode}); (re)subscribed to discovery, heartbeats, screenshots, dashboards, logs, and health") except Exception as e: logging.error(f"Subscribe failed on connect: {e}") @@ -124,15 +131,123 @@ def on_message(client, userdata, msg): # Heartbeat-Handling if topic.startswith("infoscreen/") and topic.endswith("/heartbeat"): uuid = topic.split("/")[1] + try: + # Parse 
payload to get optional health data + payload_data = json.loads(msg.payload.decode()) + except (json.JSONDecodeError, UnicodeDecodeError): + payload_data = {} + session = Session() client_obj = session.query(Client).filter_by(uuid=uuid).first() if client_obj: client_obj.last_alive = datetime.datetime.now(datetime.UTC) + + # Update health fields if present in heartbeat + if 'process_status' in payload_data: + try: + client_obj.process_status = ProcessStatus[payload_data['process_status']] + except (KeyError, TypeError): + pass + + if 'current_process' in payload_data: + client_obj.current_process = payload_data.get('current_process') + + if 'process_pid' in payload_data: + client_obj.process_pid = payload_data.get('process_pid') + + if 'current_event_id' in payload_data: + client_obj.current_event_id = payload_data.get('current_event_id') + session.commit() - logging.info( - f"Heartbeat von {uuid} empfangen, last_alive (UTC) aktualisiert.") + logging.info(f"Heartbeat von {uuid} empfangen, last_alive (UTC) aktualisiert.") session.close() return + + # Log-Handling (ERROR, WARN, INFO) + if topic.startswith("infoscreen/") and "/logs/" in topic: + parts = topic.split("/") + if len(parts) >= 4: + uuid = parts[1] + level_str = parts[3].upper() # 'error', 'warn', 'info' -> 'ERROR', 'WARN', 'INFO' + + try: + payload_data = json.loads(msg.payload.decode()) + message = payload_data.get('message', '') + timestamp_str = payload_data.get('timestamp') + context = payload_data.get('context', {}) + + # Parse timestamp or use current time + if timestamp_str: + try: + log_timestamp = datetime.datetime.fromisoformat(timestamp_str.replace('Z', '+00:00')) + if log_timestamp.tzinfo is None: + log_timestamp = log_timestamp.replace(tzinfo=datetime.UTC) + except ValueError: + log_timestamp = datetime.datetime.now(datetime.UTC) + else: + log_timestamp = datetime.datetime.now(datetime.UTC) + + # Store in database + session = Session() + try: + log_level = LogLevel[level_str] + log_entry = 
ClientLog( + client_uuid=uuid, + timestamp=log_timestamp, + level=log_level, + message=message, + context=json.dumps(context) if context else None + ) + session.add(log_entry) + session.commit() + logging.info(f"[{level_str}] {uuid}: {message}") + except Exception as e: + logging.error(f"Error saving log from {uuid}: {e}") + session.rollback() + finally: + session.close() + + except (json.JSONDecodeError, UnicodeDecodeError) as e: + logging.error(f"Could not parse log payload from {uuid}: {e}") + return + + # Health-Handling + if topic.startswith("infoscreen/") and topic.endswith("/health"): + uuid = topic.split("/")[1] + try: + payload_data = json.loads(msg.payload.decode()) + + session = Session() + client_obj = session.query(Client).filter_by(uuid=uuid).first() + if client_obj: + # Update expected state + expected = payload_data.get('expected_state', {}) + if 'event_id' in expected: + client_obj.current_event_id = expected['event_id'] + + # Update actual state + actual = payload_data.get('actual_state', {}) + if 'process' in actual: + client_obj.current_process = actual['process'] + + if 'pid' in actual: + client_obj.process_pid = actual['pid'] + + if 'status' in actual: + try: + client_obj.process_status = ProcessStatus[actual['status']] + except (KeyError, TypeError): + pass + + session.commit() + logging.debug(f"Health update from {uuid}: {actual.get('process')} ({actual.get('status')})") + session.close() + + except (json.JSONDecodeError, UnicodeDecodeError) as e: + logging.error(f"Could not parse health payload from {uuid}: {e}") + except Exception as e: + logging.error(f"Error processing health from {uuid}: {e}") + return # Discovery-Handling if topic == "infoscreen/discovery": diff --git a/models/models.py b/models/models.py index c1a9980..089d2b8 100644 --- a/models/models.py +++ b/models/models.py @@ -21,6 +21,27 @@ class AcademicPeriodType(enum.Enum): trimester = "trimester" +class LogLevel(enum.Enum): + ERROR = "ERROR" + WARN = "WARN" + INFO = "INFO" 
+ DEBUG = "DEBUG" + + +class ProcessStatus(enum.Enum): + running = "running" + crashed = "crashed" + starting = "starting" + stopped = "stopped" + + +class ScreenHealthStatus(enum.Enum): + OK = "OK" + BLACK = "BLACK" + FROZEN = "FROZEN" + UNKNOWN = "UNKNOWN" + + class User(Base): __tablename__ = 'users' id = Column(Integer, primary_key=True, autoincrement=True) @@ -106,6 +127,31 @@ class Client(Base): is_active = Column(Boolean, default=True, nullable=False) group_id = Column(Integer, ForeignKey( 'client_groups.id'), nullable=False, default=1) + + # Health monitoring fields + current_event_id = Column(Integer, nullable=True) + current_process = Column(String(50), nullable=True) # 'vlc', 'chromium', 'pdf_viewer' + process_status = Column(Enum(ProcessStatus), nullable=True) + process_pid = Column(Integer, nullable=True) + last_screenshot_analyzed = Column(TIMESTAMP(timezone=True), nullable=True) + screen_health_status = Column(Enum(ScreenHealthStatus), nullable=True, server_default='UNKNOWN') + last_screenshot_hash = Column(String(32), nullable=True) + + +class ClientLog(Base): + __tablename__ = 'client_logs' + id = Column(Integer, primary_key=True, autoincrement=True) + client_uuid = Column(String(36), ForeignKey('clients.uuid', ondelete='CASCADE'), nullable=False, index=True) + timestamp = Column(TIMESTAMP(timezone=True), nullable=False, index=True) + level = Column(Enum(LogLevel), nullable=False, index=True) + message = Column(Text, nullable=False) + context = Column(Text, nullable=True) # JSON stored as text + created_at = Column(TIMESTAMP(timezone=True), server_default=func.current_timestamp(), nullable=False) + + __table_args__ = ( + Index('ix_client_logs_client_timestamp', 'client_uuid', 'timestamp'), + Index('ix_client_logs_level_timestamp', 'level', 'timestamp'), + ) class EventType(enum.Enum): diff --git a/server/alembic/versions/c1d2e3f4g5h6_add_client_monitoring.py b/server/alembic/versions/c1d2e3f4g5h6_add_client_monitoring.py new file mode 100644 index 
0000000..15adbd4 --- /dev/null +++ b/server/alembic/versions/c1d2e3f4g5h6_add_client_monitoring.py @@ -0,0 +1,84 @@ +"""add client monitoring tables and columns + +Revision ID: c1d2e3f4g5h6 +Revises: 4f0b8a3e5c20 +Create Date: 2026-03-09 21:08:38.000000 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = 'c1d2e3f4g5h6' +down_revision = '4f0b8a3e5c20' +branch_labels = None +depends_on = None + + +def upgrade(): + bind = op.get_bind() + inspector = sa.inspect(bind) + + # 1. Add health monitoring columns to clients table (safe on rerun) + existing_client_columns = {c['name'] for c in inspector.get_columns('clients')} + if 'current_event_id' not in existing_client_columns: + op.add_column('clients', sa.Column('current_event_id', sa.Integer(), nullable=True)) + if 'current_process' not in existing_client_columns: + op.add_column('clients', sa.Column('current_process', sa.String(50), nullable=True)) + if 'process_status' not in existing_client_columns: + op.add_column('clients', sa.Column('process_status', sa.Enum('running', 'crashed', 'starting', 'stopped', name='processstatus'), nullable=True)) + if 'process_pid' not in existing_client_columns: + op.add_column('clients', sa.Column('process_pid', sa.Integer(), nullable=True)) + if 'last_screenshot_analyzed' not in existing_client_columns: + op.add_column('clients', sa.Column('last_screenshot_analyzed', sa.TIMESTAMP(timezone=True), nullable=True)) + if 'screen_health_status' not in existing_client_columns: + op.add_column('clients', sa.Column('screen_health_status', sa.Enum('OK', 'BLACK', 'FROZEN', 'UNKNOWN', name='screenhealthstatus'), nullable=True, server_default='UNKNOWN')) + if 'last_screenshot_hash' not in existing_client_columns: + op.add_column('clients', sa.Column('last_screenshot_hash', sa.String(32), nullable=True)) + + # 2. 
Create client_logs table (safe on rerun) + if not inspector.has_table('client_logs'): + op.create_table('client_logs', + sa.Column('id', sa.Integer(), autoincrement=True, nullable=False), + sa.Column('client_uuid', sa.String(36), nullable=False), + sa.Column('timestamp', sa.TIMESTAMP(timezone=True), nullable=False), + sa.Column('level', sa.Enum('ERROR', 'WARN', 'INFO', 'DEBUG', name='loglevel'), nullable=False), + sa.Column('message', sa.Text(), nullable=False), + sa.Column('context', sa.Text(), nullable=True), + sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.current_timestamp(), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.ForeignKeyConstraint(['client_uuid'], ['clients.uuid'], ondelete='CASCADE'), + mysql_charset='utf8mb4', + mysql_collate='utf8mb4_unicode_ci', + mysql_engine='InnoDB' + ) + + # 3. Create indexes for efficient querying (safe on rerun) + client_log_indexes = {idx['name'] for idx in inspector.get_indexes('client_logs')} if inspector.has_table('client_logs') else set() + client_indexes = {idx['name'] for idx in inspector.get_indexes('clients')} + + if 'ix_client_logs_client_timestamp' not in client_log_indexes: + op.create_index('ix_client_logs_client_timestamp', 'client_logs', ['client_uuid', 'timestamp']) + if 'ix_client_logs_level_timestamp' not in client_log_indexes: + op.create_index('ix_client_logs_level_timestamp', 'client_logs', ['level', 'timestamp']) + if 'ix_clients_process_status' not in client_indexes: + op.create_index('ix_clients_process_status', 'clients', ['process_status']) + + +def downgrade(): + # Drop indexes + op.drop_index('ix_clients_process_status', table_name='clients') + op.drop_index('ix_client_logs_level_timestamp', table_name='client_logs') + op.drop_index('ix_client_logs_client_timestamp', table_name='client_logs') + + # Drop table + op.drop_table('client_logs') + + # Drop columns from clients + op.drop_column('clients', 'last_screenshot_hash') + op.drop_column('clients', 
'screen_health_status') + op.drop_column('clients', 'last_screenshot_analyzed') + op.drop_column('clients', 'process_pid') + op.drop_column('clients', 'process_status') + op.drop_column('clients', 'current_process') + op.drop_column('clients', 'current_event_id') diff --git a/server/routes/client_logs.py b/server/routes/client_logs.py new file mode 100644 index 0000000..c3df644 --- /dev/null +++ b/server/routes/client_logs.py @@ -0,0 +1,255 @@ +from flask import Blueprint, jsonify, request +from server.database import Session +from server.permissions import admin_or_higher +from models.models import ClientLog, Client, LogLevel +from sqlalchemy import desc, func +from datetime import datetime, timedelta, timezone +import json + +client_logs_bp = Blueprint("client_logs", __name__, url_prefix="/api/client-logs") + + +@client_logs_bp.route("/test", methods=["GET"]) +def test_client_logs(): + """Test endpoint to verify logging infrastructure (no auth required)""" + session = Session() + try: + # Count total logs + total_logs = session.query(func.count(ClientLog.id)).scalar() + + # Count by level + error_count = session.query(func.count(ClientLog.id)).filter_by(level=LogLevel.ERROR).scalar() + warn_count = session.query(func.count(ClientLog.id)).filter_by(level=LogLevel.WARN).scalar() + info_count = session.query(func.count(ClientLog.id)).filter_by(level=LogLevel.INFO).scalar() + + # Get last 5 logs + recent_logs = session.query(ClientLog).order_by(desc(ClientLog.timestamp)).limit(5).all() + + recent = [] + for log in recent_logs: + recent.append({ + "client_uuid": log.client_uuid, + "level": log.level.value if log.level else None, + "message": log.message, + "timestamp": log.timestamp.isoformat() if log.timestamp else None + }) + + session.close() + return jsonify({ + "status": "ok", + "infrastructure": "working", + "total_logs": total_logs, + "counts": { + "ERROR": error_count, + "WARN": warn_count, + "INFO": info_count + }, + "recent_5": recent + }) + except Exception 
as e: + session.close() + return jsonify({"status": "error", "message": str(e)}), 500 + + +@client_logs_bp.route("/<uuid>/logs", methods=["GET"]) +@admin_or_higher +def get_client_logs(uuid): + """ + Get logs for a specific client + Query params: + - level: ERROR, WARN, INFO, DEBUG (optional) + - limit: number of entries (default 50, max 500) + - since: ISO timestamp (optional) + + Example: /api/client-logs/abc-123/logs?level=ERROR&limit=100 + """ + session = Session() + try: + # Verify client exists + client = session.query(Client).filter_by(uuid=uuid).first() + if not client: + session.close() + return jsonify({"error": "Client not found"}), 404 + + # Parse query parameters + level_param = request.args.get('level') + limit = min(int(request.args.get('limit', 50)), 500) + since_param = request.args.get('since') + + # Build query + query = session.query(ClientLog).filter_by(client_uuid=uuid) + + # Filter by log level + if level_param: + try: + level_enum = LogLevel[level_param.upper()] + query = query.filter_by(level=level_enum) + except KeyError: + session.close() + return jsonify({"error": f"Invalid level: {level_param}. Must be ERROR, WARN, INFO, or DEBUG"}), 400 + + # Filter by timestamp + if since_param: + try: + # Handle both with and without 'Z' suffix + since_str = since_param.replace('Z', '+00:00') + since_dt = datetime.fromisoformat(since_str) + if since_dt.tzinfo is None: + since_dt = since_dt.replace(tzinfo=timezone.utc) + query = query.filter(ClientLog.timestamp >= since_dt) + except ValueError: + session.close() + return jsonify({"error": "Invalid timestamp format. 
Use ISO 8601"}), 400 + + # Execute query + logs = query.order_by(desc(ClientLog.timestamp)).limit(limit).all() + + # Format results + result = [] + for log in logs: + entry = { + "id": log.id, + "timestamp": log.timestamp.isoformat() if log.timestamp else None, + "level": log.level.value if log.level else None, + "message": log.message, + "context": {} + } + + # Parse context JSON + if log.context: + try: + entry["context"] = json.loads(log.context) + except json.JSONDecodeError: + entry["context"] = {"raw": log.context} + + result.append(entry) + + session.close() + return jsonify({ + "client_uuid": uuid, + "logs": result, + "count": len(result), + "limit": limit + }) + + except Exception as e: + session.close() + return jsonify({"error": f"Server error: {str(e)}"}), 500 + + +@client_logs_bp.route("/summary", methods=["GET"]) +@admin_or_higher +def get_logs_summary(): + """ + Get summary of errors/warnings across all clients in last 24 hours + Returns count of ERROR, WARN, INFO logs per client + + Example response: + { + "summary": { + "client-uuid-1": {"ERROR": 5, "WARN": 12, "INFO": 45}, + "client-uuid-2": {"ERROR": 0, "WARN": 3, "INFO": 20} + }, + "period_hours": 24, + "timestamp": "2026-03-09T21:00:00Z" + } + """ + session = Session() + try: + # Get hours parameter (default 24, max 168 = 1 week) + hours = min(int(request.args.get('hours', 24)), 168) + since = datetime.now(timezone.utc) - timedelta(hours=hours) + + # Query log counts grouped by client and level + stats = session.query( + ClientLog.client_uuid, + ClientLog.level, + func.count(ClientLog.id).label('count') + ).filter( + ClientLog.timestamp >= since + ).group_by( + ClientLog.client_uuid, + ClientLog.level + ).all() + + # Build summary dictionary + summary = {} + for stat in stats: + uuid = stat.client_uuid + if uuid not in summary: + # Initialize all levels to 0 + summary[uuid] = { + "ERROR": 0, + "WARN": 0, + "INFO": 0, + "DEBUG": 0 + } + + summary[uuid][stat.level.value] = stat.count + + # Get 
client info for enrichment + clients = session.query(Client.uuid, Client.hostname, Client.description).all() + client_info = {c.uuid: {"hostname": c.hostname, "description": c.description} for c in clients} + + # Enrich summary with client info + enriched_summary = {} + for uuid, counts in summary.items(): + enriched_summary[uuid] = { + "counts": counts, + "info": client_info.get(uuid, {}) + } + + session.close() + return jsonify({ + "summary": enriched_summary, + "period_hours": hours, + "since": since.isoformat(), + "timestamp": datetime.now(timezone.utc).isoformat() + }) + + except Exception as e: + session.close() + return jsonify({"error": f"Server error: {str(e)}"}), 500 + + +@client_logs_bp.route("/recent-errors", methods=["GET"]) +@admin_or_higher +def get_recent_errors(): + """ + Get recent ERROR logs across all clients + Query params: + - limit: number of entries (default 20, max 100) + + Useful for system-wide error monitoring + """ + session = Session() + try: + limit = min(int(request.args.get('limit', 20)), 100) + + # Get recent errors from all clients + logs = session.query(ClientLog).filter_by( + level=LogLevel.ERROR + ).order_by( + desc(ClientLog.timestamp) + ).limit(limit).all() + + result = [] + for log in logs: + entry = { + "id": log.id, + "client_uuid": log.client_uuid, + "timestamp": log.timestamp.isoformat() if log.timestamp else None, + "message": log.message, + "context": json.loads(log.context) if log.context else {} + } + result.append(entry) + + session.close() + return jsonify({ + "errors": result, + "count": len(result) + }) + + except Exception as e: + session.close() + return jsonify({"error": f"Server error: {str(e)}"}), 500 diff --git a/server/wsgi.py b/server/wsgi.py index 9b3781f..53197c8 100644 --- a/server/wsgi.py +++ b/server/wsgi.py @@ -8,6 +8,7 @@ from server.routes.holidays import holidays_bp from server.routes.academic_periods import academic_periods_bp from server.routes.groups import groups_bp from server.routes.clients 
import clients_bp +from server.routes.client_logs import client_logs_bp from server.routes.auth import auth_bp from server.routes.users import users_bp from server.routes.system_settings import system_settings_bp @@ -46,6 +47,7 @@ else: app.register_blueprint(auth_bp) app.register_blueprint(users_bp) app.register_blueprint(clients_bp) +app.register_blueprint(client_logs_bp) app.register_blueprint(groups_bp) app.register_blueprint(events_bp) app.register_blueprint(event_exceptions_bp)