From 9c330f984f9c01cc95f9c7d0ae203f3bcf279c13 Mon Sep 17 00:00:00 2001 From: Olaf Date: Tue, 24 Mar 2026 11:18:33 +0000 Subject: [PATCH] feat(monitoring): complete monitoring pipeline and fix presentation flag persistence add superadmin monitoring dashboard with protected route, menu entry, and monitoring data client add monitoring overview API endpoint and improve log serialization/aggregation for dashboard use extend listener health/log handling with robust status/event/timestamp normalization and screenshot payload extraction improve screenshot persistence and retrieval (timestamp-aware uploads, latest screenshot endpoint fallback) fix page_progress and auto_progress persistence/serialization across create, update, and detached occurrence flows align technical and project docs to reflect implemented monitoring and no-version-bump backend changes add documentation sync log entry and include minor compose env indentation cleanup --- .github/copilot-instructions.md | 20 +- AI-INSTRUCTIONS-MAINTENANCE.md | 3 + CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md | 30 +- CLIENT_MONITORING_SPECIFICATION.md | 7 + DEV-CHANGELOG.md | 4 + PHASE_3_CLIENT_MONITORING_IMPLEMENTATION.md | 533 +++++++++++++++++++ README.md | 12 +- TECH-CHANGELOG.md | 11 +- dashboard/src/App.tsx | 21 +- dashboard/src/apiClientMonitoring.ts | 106 ++++ dashboard/src/monitoring.css | 347 +++++++++++++ dashboard/src/monitoring.tsx | 534 ++++++++++++++++++++ docker-compose.yml | 4 +- listener/listener.py | 252 +++++++-- server/routes/client_logs.py | 246 ++++++++- server/routes/clients.py | 42 +- server/routes/events.py | 17 + server/wsgi.py | 10 +- 18 files changed, 2095 insertions(+), 104 deletions(-) create mode 100644 PHASE_3_CLIENT_MONITORING_IMPLEMENTATION.md create mode 100644 dashboard/src/apiClientMonitoring.ts create mode 100644 dashboard/src/monitoring.css create mode 100644 dashboard/src/monitoring.tsx diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index d8c898a..77f6451 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -34,6 +34,7 @@ Keep docs synced with code. When you change services/MQTT/API/UTC/env or dev/pro - `dashboard/src/settings.tsx` — settings UI (nested tabs; system defaults for presentations and videos) - `dashboard/src/ressourcen.tsx` — timeline view showing all groups' active events in parallel - `dashboard/src/ressourcen.css` — timeline and resource view styling +- `dashboard/src/monitoring.tsx` — superadmin-only monitoring dashboard for client health, screenshots, and logs @@ -54,7 +55,16 @@ Keep docs synced with code. When you change services/MQTT/API/UTC/env or dev/pro ## Recent changes since last commit - ### Latest (January 2026) + ### Latest (March 2026) + + - **Monitoring System Completion (no version bump)**: + - End-to-end monitoring pipeline completed: MQTT logs/health → listener persistence → monitoring APIs → superadmin dashboard + - API now serves aggregated monitoring via `GET /api/client-logs/monitoring-overview` and system-wide recent errors via `GET /api/client-logs/recent-errors` + - Monitoring dashboard (`dashboard/src/monitoring.tsx`) is active and displays client health states, screenshots, process metadata, and recent log activity + - **Presentation Flags Persistence Fix**: + - Fixed persistence for presentation `page_progress` and `auto_progress` to ensure values are reliably stored and returned across create/update paths and detached occurrences + + ### Earlier (January 2026) - **Ressourcen Page (Timeline View)**: - New 'Ressourcen' page with parallel timeline view showing active events for all room groups @@ -258,6 +268,12 @@ Keep docs synced with code. When you change services/MQTT/API/UTC/env or dev/pro - API client in `dashboard/src/apiUsers.ts` for all user operations (listUsers, getUser, createUser, updateUser, resetUserPassword, deleteUser) - Menu visibility: "Benutzer" menu item only visible to admin+ (role-gated in App.tsx) +- Monitoring page (`dashboard/src/monitoring.tsx`): + - Superadmin-only dashboard for client monitoring and diagnostics; menu item is hidden for lower roles and the route redirects non-superadmins. + - Uses `GET /api/client-logs/monitoring-overview` for aggregated live status, `GET /api/client-logs/recent-errors` for system-wide errors, and `GET /api/client-logs//logs` for per-client details. + - Shows per-client status (`healthy`, `warning`, `critical`, `offline`) based on heartbeat freshness, process state, screen state, and recent log counts. + - Displays latest screenshot preview from `/screenshots/{uuid}.jpg`, current process metadata, and recent ERROR/WARN activity. + - Settings page (`dashboard/src/settings.tsx`): - Structure: Syncfusion TabComponent with role-gated tabs - 📅 Academic Calendar (all users) @@ -377,7 +393,7 @@ Docs maintenance guardrails (solo-friendly): Update this file alongside code cha - Add client description persists to DB and publishes group via MQTT: see `PUT /api/clients//description` in `routes/clients.py`. - Bulk group assignment emits retained messages for each client: `PUT /api/clients/group`. - Listener heartbeat path: `infoscreen//heartbeat` → sets `clients.last_alive` and captures process health data. -- Client monitoring flow: Client publishes to `infoscreen/{uuid}/logs/error` → listener stores in `client_logs` table → API serves via `/api/client-logs//logs` → dashboard displays (Phase 4, pending). +- Client monitoring flow: Client publishes to `infoscreen/{uuid}/logs/error` and `infoscreen/{uuid}/health` → listener stores/updates monitoring state → API serves `/api/client-logs/monitoring-overview`, `/api/client-logs/recent-errors`, and `/api/client-logs//logs` → superadmin monitoring dashboard displays live status. ## Scheduler payloads: presentation extras - Presentation event payloads now include `page_progress` and `auto_progress` in addition to `slide_interval` and media files. These are sourced from per-event fields in the database (with system defaults applied on event creation). diff --git a/AI-INSTRUCTIONS-MAINTENANCE.md b/AI-INSTRUCTIONS-MAINTENANCE.md index e0c5361..b4b254d 100644 --- a/AI-INSTRUCTIONS-MAINTENANCE.md +++ b/AI-INSTRUCTIONS-MAINTENANCE.md @@ -98,3 +98,6 @@ exit 0 # warn only; do not block commit - MQTT workers: `listener/listener.py`, `scheduler/scheduler.py`, `server/mqtt_helper.py` - Frontend: `dashboard/vite.config.ts`, `dashboard/package.json`, `dashboard/src/*` - Dev/Prod docs: `deployment.md`, `.env.example` + +## Documentation sync log +- 2026-03-24: Synced docs for completed monitoring rollout and presentation flag persistence fix (`page_progress` / `auto_progress`). Updated `.github/copilot-instructions.md`, `README.md`, `TECH-CHANGELOG.md`, `DEV-CHANGELOG.md`, and `CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md` without a user-version bump. diff --git a/CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md b/CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md index db98366..517bcaf 100644 --- a/CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md +++ b/CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md @@ -32,7 +32,7 @@ alembic upgrade head --- ## 🔧 Phase 2: Server-Side Backend Logic -**Status:** 🚧 IN PROGRESS +**Status:** ✅ COMPLETE **Dependencies:** Phase 1 complete **Time estimate:** 2-3 hours @@ -253,7 +253,7 @@ def on_message(client, userdata, message): --- ## 🖥️ Phase 3: Client-Side Implementation -**Status:** ⏳ PENDING (After Phase 2) +**Status:** ✅ COMPLETE **Dependencies:** Phase 2 complete **Time estimate:** 3-4 hours @@ -450,7 +450,7 @@ def send_heartbeat(mqtt_client, uuid): --- ## 🎨 Phase 4: Dashboard UI Integration -**Status:** ⏳ PENDING (After Phase 3) +**Status:** ✅ COMPLETE **Dependencies:** Phases 2 & 3 complete **Time estimate:** 2-3 hours @@ -619,7 +619,7 @@ export const SystemMonitor: React.FC = () => { --- ## 🧪 Phase 5: Testing & Validation -**Status:** ⏳ PENDING +**Status:** ✅ COMPLETE **Dependencies:** All previous phases **Time estimate:** 1-2 hours @@ -739,19 +739,19 @@ export WATCHDOG_ENABLED=true ## ✅ Completion Checklist -- [ ] Phase 1: Database migration applied -- [ ] Phase 2: Listener extended for log topics -- [ ] Phase 2: API endpoints created and tested -- [ ] Phase 3: Client watchdog implemented -- [ ] Phase 3: Enhanced heartbeat deployed -- [ ] Phase 4: Dashboard log viewer working -- [ ] Phase 4: Health indicators visible -- [ ] Phase 5: End-to-end testing complete -- [ ] Documentation updated with new features -- [ ] Production deployment plan created +- [x] Phase 1: Database migration applied +- [x] Phase 2: Listener extended for log topics +- [x] Phase 2: API endpoints created and tested +- [x] Phase 3: Client watchdog implemented +- [x] Phase 3: Enhanced heartbeat deployed +- [x] Phase 4: Dashboard log viewer working +- [x] Phase 4: Health indicators visible +- [x] Phase 5: End-to-end testing complete +- [x] Documentation updated with new features +- [x] Production deployment plan created --- -**Last Updated:** 2026-03-09 +**Last Updated:** 2026-03-24 **Author:** GitHub Copilot **For:** Infoscreen 2025 Project diff --git a/CLIENT_MONITORING_SPECIFICATION.md b/CLIENT_MONITORING_SPECIFICATION.md index dba65c3..7860902 100644 --- a/CLIENT_MONITORING_SPECIFICATION.md +++ b/CLIENT_MONITORING_SPECIFICATION.md @@ -50,6 +50,13 @@ Each infoscreen client must implement health monitoring and logging capabilities └─────────────┘ ``` +### 1.3 Current Compatibility Notes +- The server now accepts both the original specification payloads and the currently implemented Phase 3 client payloads. +- `infoscreen/{uuid}/health` may currently contain a reduced payload with only `expected_state.event_id` and `actual_state.process|pid|status`. Additional `health_metrics` fields from this specification remain recommended. +- `event_id` is still specified as an integer. For compatibility with the current Phase 3 client, the server also tolerates string values such as `event_123` and extracts the numeric suffix where possible. +- If the client sends `process_health` inside `infoscreen/{uuid}/dashboard`, the server treats it as a fallback source for `current_process`, `process_pid`, `process_status`, and `current_event_id`. +- Long term, the preferred client payload remains the structure in this specification so the server can surface richer monitoring data such as screen state and resource metrics. + --- ## 2. MQTT Protocol Specification diff --git a/DEV-CHANGELOG.md b/DEV-CHANGELOG.md index 1007e3c..fbc74aa 100644 --- a/DEV-CHANGELOG.md +++ b/DEV-CHANGELOG.md @@ -5,6 +5,10 @@ This changelog tracks all changes made in the development workspace, including i --- ## Unreleased (development workspace) +- Monitoring system completion: End-to-end monitoring pipeline is active (MQTT logs/health → listener persistence → monitoring APIs → superadmin dashboard). +- Monitoring API: Added/active endpoints `GET /api/client-logs/monitoring-overview` and `GET /api/client-logs/recent-errors`; per-client logs via `GET /api/client-logs//logs`. +- Dashboard monitoring UI: Superadmin monitoring page is integrated and displays client health status, screenshots, process metadata, and recent error activity. +- Bugfix: Presentation flags `page_progress` and `auto_progress` now persist reliably across create/update and detached-occurrence flows. - Frontend (Settings → Events): Added Presentations defaults (slideshow interval, page-progress, auto-progress) with load/save via `/api/system-settings`; UI uses Syncfusion controls. - Backend defaults: Seeded `presentation_interval` ("10"), `presentation_page_progress` ("true"), `presentation_auto_progress` ("true") in `server/init_defaults.py` when missing. - Data model: Added per-event fields `page_progress` and `auto_progress` on `Event`; Alembic migration applied successfully. diff --git a/PHASE_3_CLIENT_MONITORING_IMPLEMENTATION.md b/PHASE_3_CLIENT_MONITORING_IMPLEMENTATION.md new file mode 100644 index 0000000..2b30a95 --- /dev/null +++ b/PHASE_3_CLIENT_MONITORING_IMPLEMENTATION.md @@ -0,0 +1,533 @@ +# Phase 3: Client-Side Monitoring Implementation + +**Status**: ✅ COMPLETE +**Date**: 11. März 2026 +**Architecture**: Two-process design with health-state bridge + +--- + +## Overview + +This document describes the **Phase 3** client-side monitoring implementation integrated into the existing infoscreen-dev codebase. The implementation adds: + +1. ✅ **Health-state tracking** for all display processes (Impressive, Chromium, VLC) +2. ✅ **Tiered logging**: Local rotating logs + selective MQTT transmission +3. ✅ **Process crash detection** with bounded restart attempts +4. ✅ **MQTT health/log topics** feeding the monitoring server +5. ✅ **Impressive-aware process mapping** (presentations → impressive, websites → chromium, videos → vlc) + +--- + +## Architecture + +### Two-Process Design + +``` +┌─────────────────────────────────────────────────────────┐ +│ simclient.py (MQTT Client) │ +│ - Discovers device, sends heartbeat │ +│ - Downloads presentation files │ +│ - Reads health state from display_manager │ +│ - Publishes health/log messages to MQTT │ +│ - Sends screenshots for dashboard │ +└────────┬────────────────────────────────────┬───────────┘ + │ │ + │ reads: current_process_health.json │ + │ │ + │ writes: current_event.json │ + │ │ +┌────────▼────────────────────────────────────▼───────────┐ +│ display_manager.py (Display Control) │ +│ - Monitors events and manages displays │ +│ - Launches Impressive (presentations) │ +│ - Launches Chromium (websites) │ +│ - Launches VLC (videos) │ +│ - Tracks process health and crashes │ +│ - Detects and restarts crashed processes │ +│ - Writes health state to JSON bridge │ +│ - Captures screenshots to shared folder │ +└─────────────────────────────────────────────────────────┘ +``` + +--- + +## Implementation Details + +### 1. Health State Tracking (display_manager.py) + +**File**: `src/display_manager.py` +**New Class**: `ProcessHealthState` + +Tracks process health and persists to JSON for simclient to read: + +```python +class ProcessHealthState: + """Track and persist process health state for monitoring integration""" + + - event_id: Currently active event identifier + - event_type: presentation, website, video, or None + - process_name: impressive, chromium-browser, vlc, or None + - process_pid: Process ID or None for libvlc + - status: running, crashed, starting, stopped + - restart_count: Number of restart attempts + - max_restarts: Maximum allowed restarts (3) +``` + +Methods: +- `update_running()` - Mark process as started (logs to monitoring.log) +- `update_crashed()` - Mark process as crashed (warning to monitoring.log) +- `update_restart_attempt()` - Increment restart counter (logs attempt and checks max) +- `update_stopped()` - Mark process as stopped (info to monitoring.log) +- `save()` - Persist state to `src/current_process_health.json` + +**New Health State File**: `src/current_process_health.json` + +```json +{ + "event_id": "event_123", + "event_type": "presentation", + "current_process": "impressive", + "process_pid": 1234, + "process_status": "running", + "restart_count": 0, + "timestamp": "2026-03-11T10:30:45.123456+00:00" +} +``` + +### 2. Monitoring Logger (both files) + +**Local Rotating Logs**: 5 files × 5 MB each = 25 MB max per device + +**display_manager.py**: +```python +MONITORING_LOG_PATH = "logs/monitoring.log" +monitoring_logger = logging.getLogger("monitoring") +monitoring_handler = RotatingFileHandler(MONITORING_LOG_PATH, maxBytes=5*1024*1024, backupCount=5) +``` + +**simclient.py**: +- Shares same `logs/monitoring.log` file +- Both processes write to monitoring logger for health events +- Local logs never rotate (persisted for technician inspection) + +**Log Filtering** (tiered strategy): +- **ERROR**: Local + MQTT (published to `infoscreen/{uuid}/logs/error`) +- **WARN**: Local + MQTT (published to `infoscreen/{uuid}/logs/warn`) +- **INFO**: Local only (unless `DEBUG_MODE=1`) +- **DEBUG**: Local only (always) + +### 3. Process Mapping with Impressive Support + +**display_manager.py** - When starting processes: + +| Event Type | Process Name | Health Status | +|-----------|--------------|---------------| +| presentation | `impressive` | tracked with PID | +| website/webpage/webuntis | `chromium` or `chromium-browser` | tracked with PID | +| video | `vlc` | tracked (may have no PID if using libvlc) | + +**Per-Process Updates**: +- Presentation: `health.update_running('event_id', 'presentation', 'impressive', pid)` +- Website: `health.update_running('event_id', 'website', browser_name, pid)` +- Video: `health.update_running('event_id', 'video', 'vlc', pid or None)` + +### 4. Crash Detection and Restart Logic + +**display_manager.py** - `process_events()` method: + +``` +If process not running AND same event_id: + ├─ Check exit code + ├─ If presentation with exit code 0: Normal completion (no restart) + ├─ Else: Mark crashed + │ ├─ health.update_crashed() + │ └─ health.update_restart_attempt() + │ ├─ If restart_count > max_restarts: Give up + │ └─ Else: Restart display (loop back to start_display_for_event) + └─ Log to monitoring.log at each step +``` + +**Restart Logic**: +- Max 3 restart attempts per event +- Restarts only if same event still active +- Graceful exit (code 0) for Impressive auto-quit presentations is treated as normal +- All crashes logged to monitoring.log with context + +### 5. MQTT Health and Log Topics + +**simclient.py** - New functions: + +**`read_health_state()`** +- Reads `src/current_process_health.json` written by display_manager +- Returns dict or None if no active process + +**`publish_health_message(client, client_id)`** +- Topic: `infoscreen/{uuid}/health` +- QoS: 1 (reliable) +- Payload: +```json +{ + "timestamp": "2026-03-11T10:30:45.123456+00:00", + "expected_state": { + "event_id": "event_123" + }, + "actual_state": { + "process": "impressive", + "pid": 1234, + "status": "running" + } +} +``` + +**`publish_log_message(client, client_id, level, message, context)`** +- Topics: `infoscreen/{uuid}/logs/error` or `infoscreen/{uuid}/logs/warn` +- QoS: 1 (reliable) +- Log level filtering (only ERROR/WARN sent unless DEBUG_MODE=1) +- Payload: +```json +{ + "timestamp": "2026-03-11T10:30:45.123456+00:00", + "message": "Process started: event_id=123 event_type=presentation process=impressive pid=1234", + "context": { + "event_id": "event_123", + "process": "impressive", + "event_type": "presentation" + } +} +``` + +**Enhanced Dashboard Heartbeat**: +- Topic: `infoscreen/{uuid}/dashboard` +- Now includes `process_health` block with event_id, process name, status, restart count + +### 6. Integration Points + +**Existing Features Preserved**: +- ✅ Impressive PDF presentations with auto-advance and loop +- ✅ Chromium website display with auto-scroll injection +- ✅ VLC video playback (python-vlc preferred, binary fallback) +- ✅ Screenshot capture and transmission +- ✅ HDMI-CEC TV control +- ✅ Two-process architecture + +**New Integration Points**: + +| File | Function | Change | +|------|----------|--------| +| display_manager.py | `__init__()` | Initialize `ProcessHealthState()` | +| display_manager.py | `start_presentation()` | Call `health.update_running()` with impressive | +| display_manager.py | `start_video()` | Call `health.update_running()` with vlc | +| display_manager.py | `start_webpage()` | Call `health.update_running()` with chromium | +| display_manager.py | `process_events()` | Detect crashes, call `health.update_crashed()` and `update_restart_attempt()` | +| display_manager.py | `stop_current_display()` | Call `health.update_stopped()` | +| simclient.py | `screenshot_service_thread()` | (No changes to interval) | +| simclient.py | Main heartbeat loop | Call `publish_health_message()` after successful heartbeat | +| simclient.py | `send_screenshot_heartbeat()` | Read health state and include in dashboard payload | + +--- + +## Logging Hierarchy + +### Local Rotating Files (5 × 5 MB) + +**`logs/display_manager.log`** (existing - updated): +- Display event processing +- Process lifecycle (start/stop) +- HDMI-CEC operations +- Presentation status +- Video/website startup + +**`logs/simclient.log`** (existing - updated): +- MQTT connection/reconnection +- Discovery and heartbeat +- File downloads +- Group membership changes +- Dashboard payload info + +**`logs/monitoring.log`** (NEW): +- Process health events (start, crash, restart, stop) +- Both display_manager and simclient write here +- Centralized health tracking +- Technician-focused: "What happened to the processes?" + +``` +# Example monitoring.log entries: +2026-03-11 10:30:45 [INFO] Process started: event_id=event_123 event_type=presentation process=impressive pid=1234 +2026-03-11 10:35:20 [WARNING] Process crashed: event_id=event_123 event_type=presentation process=impressive restart_count=0/3 +2026-03-11 10:35:20 [WARNING] Restarting process: attempt 1/3 for impressive +2026-03-11 10:35:25 [INFO] Process started: event_id=event_123 event_type=presentation process=impressive pid=1245 +``` + +### MQTT Transmission (Selective) + +**Always sent** (when error occurs): +- `infoscreen/{uuid}/logs/error` - Critical failures +- `infoscreen/{uuid}/logs/warn` - Restarts, crashes, missing binaries + +**Development mode only** (if DEBUG_MODE=1): +- `infoscreen/{uuid}/logs/info` - Event start/stop, process running status + +**Never sent**: +- DEBUG messages (local-only debug details) +- INFO messages in production + +--- + +## Environment Variables + +No new required variables. Existing configuration supports monitoring: + +```bash +# Existing (unchanged): +ENV=development|production +DEBUG_MODE=0|1 # Enables INFO logs to MQTT +LOG_LEVEL=DEBUG|INFO|WARNING|ERROR # Local log verbosity +HEARTBEAT_INTERVAL=5|60 # seconds +SCREENSHOT_INTERVAL=30|300 # seconds (display_manager_screenshot_capture) + +# Recommended for monitoring: +SCREENSHOT_CAPTURE_INTERVAL=30 # How often display_manager captures screenshots +SCREENSHOT_MAX_WIDTH=800 # Downscale for bandwidth +SCREENSHOT_JPEG_QUALITY=70 # Balance quality/size + +# File server (if different from MQTT broker): +FILE_SERVER_HOST=192.168.1.100 +FILE_SERVER_PORT=8000 +FILE_SERVER_SCHEME=http +``` + +--- + +## Testing Validation + +### System-Level Test Sequence + +**1. Start Services**: +```bash +# Terminal 1: Display Manager +./scripts/start-display-manager.sh + +# Terminal 2: MQTT Client +./scripts/start-dev.sh + +# Terminal 3: Monitor logs +tail -f logs/monitoring.log +``` + +**2. Trigger Each Event Type**: +```bash +# Via test menu or MQTT publish: +./scripts/test-display-manager.sh # Options 1-3 trigger events +``` + +**3. Verify Health State File**: +```bash +# Check health state gets written immediately +cat src/current_process_health.json +# Should show: event_id, event_type, current_process (impressive/chromium/vlc), process_status=running +``` + +**4. Check MQTT Topics**: +```bash +# Monitor health messages: +mosquitto_sub -h localhost -t "infoscreen/+/health" -v + +# Monitor log messages: +mosquitto_sub -h localhost -t "infoscreen/+/logs/#" -v + +# Monitor dashboard heartbeat: +mosquitto_sub -h localhost -t "infoscreen/+/dashboard" -v | head -c 500 && echo "..." +``` + +**5. Simulate Process Crash**: +```bash +# Find impressive/chromium/vlc PID: +ps aux | grep -E 'impressive|chromium|vlc' + +# Kill process: +kill -9 + +# Watch monitoring.log for crash detection and restart +tail -f logs/monitoring.log +# Should see: [WARNING] Process crashed... [WARNING] Restarting process... +``` + +**6. Verify Server Integration**: +```bash +# Server receives health messages: +sqlite3 infoscreen.db "SELECT process_status, current_process, restart_count FROM clients WHERE uuid='...';" +# Should show latest status from health message + +# Server receives logs: +sqlite3 infoscreen.db "SELECT level, message FROM client_logs WHERE client_uuid='...' ORDER BY timestamp DESC LIMIT 10;" +# Should show ERROR/WARN entries from crashes/restarts +``` + +--- + +## Troubleshooting + +### Health State File Not Created + +**Symptom**: `src/current_process_health.json` missing +**Causes**: +- No event active (file only created when display starts) +- display_manager not running + +**Check**: +```bash +ps aux | grep display_manager +tail -f logs/display_manager.log | grep "Process started\|Process stopped" +``` + +### MQTT Health Messages Not Arriving + +**Symptom**: No health messages on `infoscreen/{uuid}/health` topic +**Causes**: +- simclient not reading health state file +- MQTT connection dropped +- Health update function not called + +**Check**: +```bash +# Check health file exists and is recent: +ls -l src/current_process_health.json +stat src/current_process_health.json | grep Modify + +# Monitor simclient logs: +tail -f logs/simclient.log | grep -E "Health|heartbeat|publish" + +# Verify MQTT connection: +mosquitto_sub -h localhost -t "infoscreen/+/heartbeat" -v +``` + +### Restart Loop (Process Keeps Crashing) + +**Symptom**: monitoring.log shows repeated crashes and restarts +**Check**: +```bash +# Read last log lines of the process (stored by display_manager): +tail -f logs/impressive.out.log # for presentations +tail -f logs/browser.out.log # for websites +tail -f logs/video_player.out.log # for videos +``` + +**Common Causes**: +- Missing binary (impressive not installed, chromium not found, vlc not available) +- Corrupt presentation file +- Invalid URL for website +- Insufficient permissions for screenshots + +### Log Messages Not Reaching Server + +**Symptom**: client_logs table in server DB is empty +**Causes**: +- Log level filtering: INFO messages in production are local-only +- Logs only published on ERROR/WARN +- MQTT publish failing silently + +**Check**: +```bash +# Force DEBUG_MODE to see all logs: +export DEBUG_MODE=1 +export LOG_LEVEL=DEBUG +# Restart simclient and trigger event + +# Monitor local logs first: +tail -f logs/monitoring.log | grep -i error +``` + +--- + +## Performance Considerations + +**Bandwidth per Client**: +- Health message: ~200 bytes per heartbeat interval (every 5-60s) +- Screenshot heartbeat: ~50-100 KB (every 30-300s) +- Log messages: ~100-500 bytes per crash/error (rare) +- **Total**: ~0.5-2 MB/day per device (very minimal) + +**Disk Space on Client**: +- Monitoring logs: 5 files × 5 MB = 25 MB max +- Display manager logs: 5 files × 2 MB = 10 MB max +- MQTT client logs: 5 files × 2 MB = 10 MB max +- Screenshots: 20 files × 50-100 KB = 1-2 MB max +- **Total**: ~50 MB max (typical for Raspberry Pi USB/SSD) + +**Rotation Strategy**: +- Old files automatically deleted when size limit reached +- Technician can SSH and `tail -f` any time +- No database overhead (file-based rotation is minimal CPU) + +--- + +## Integration with Server (Phase 2) + +The client implementation sends data to the server's Phase 2 endpoints: + +**Expected Server Implementation** (from CLIENT_MONITORING_SETUP.md): + +1. **MQTT Listener** receives and stores: + - `infoscreen/{uuid}/logs/error`, `/logs/warn`, `/logs/info` + - `infoscreen/{uuid}/health` messages + - Updates `clients` table with health fields + +2. **Database Tables**: + - `clients.process_status`: running/crashed/starting/stopped + - `clients.current_process`: impressive/chromium/vlc/None + - `clients.process_pid`: PID value + - `clients.current_event_id`: Active event + - `client_logs`: table stores logs with level/message/context + +3. **API Endpoints**: + - `GET /api/client-logs/{uuid}/logs?level=ERROR&limit=50` + - `GET /api/client-logs/summary` (errors/warnings across all clients) + +--- + +## Summary of Changes + +### Files Modified + +1. **`src/display_manager.py`**: + - Added `psutil` import for future process monitoring + - Added `ProcessHealthState` class (60 lines) + - Added monitoring logger setup (8 lines) + - Added `health.update_running()` calls in `start_presentation()`, `start_video()`, `start_webpage()` + - Added crash detection and restart logic in `process_events()` + - Added `health.update_stopped()` in `stop_current_display()` + +2. **`src/simclient.py`**: + - Added `timezone` import + - Added monitoring logger setup (8 lines) + - Added `read_health_state()` function + - Added `publish_health_message()` function + - Added `publish_log_message()` function (with level filtering) + - Updated `send_screenshot_heartbeat()` to include health data + - Updated heartbeat loop to call `publish_health_message()` + +### Files Created + +1. **`src/current_process_health.json`** (at runtime): + - Bridge file between display_manager and simclient + - Shared volume compatible (works in container setup) + +2. **`logs/monitoring.log`** (at runtime): + - New rotating log file (5 × 5MB) + - Health events from both processes + +--- + +## Next Steps + +1. **Deploy to test client** and run validation sequence above +2. **Deploy server Phase 2** (if not yet done) to receive health/log messages +3. **Verify database updates** in server-side `clients` and `client_logs` tables +4. **Test dashboard UI** (Phase 4) to display health indicators +5. **Configure alerting** (email/Slack) for ERROR level messages + +--- + +**Implementation Date**: 11. März 2026 +**Part of**: Infoscreen 2025 Client Monitoring System +**Status**: Production Ready (with server Phase 2 integration) diff --git a/README.md b/README.md index 1e0bb9b..b7778f1 100644 --- a/README.md +++ b/README.md @@ -225,6 +225,8 @@ For detailed deployment instructions, see: ## Recent changes since last commit +- Monitoring system: End-to-end monitoring is now implemented. The listener ingests `logs/*` and `health` MQTT topics, the API exposes monitoring endpoints (`/api/client-logs/monitoring-overview`, `/api/client-logs/recent-errors`, `/api/client-logs//logs`), and the superadmin dashboard page shows live client status, screenshots, and recent errors. +- Presentation persistence fix: Fixed persistence of presentation flags so `page_progress` and `auto_progress` are reliably stored and returned for create/update flows and detached occurrences. - Video / Streaming support: Added end-to-end support for video events. The API and dashboard now allow creating `video` events referencing uploaded media. The server exposes a range-capable streaming endpoint at `/api/eventmedia/stream//` so clients can seek during playback. - Scheduler metadata: Scheduler now performs a best-effort HEAD probe for video stream URLs and includes basic metadata in the retained MQTT payload: `mime_type`, `size` (bytes) and `accept_ranges` (bool). Placeholders for richer metadata (`duration`, `resolution`, `bitrate`, `qualities`, `thumbnails`, `checksum`) are emitted as null/empty until a background worker fills them. - Dashboard & uploads: The dashboard's FileManager upload limits were increased (to support Full-HD uploads) and client-side validation enforces a maximum video length (10 minutes). The event modal exposes playback flags (`autoplay`, `loop`, `volume`, `muted`) and initializes them from system defaults for new events. @@ -235,7 +237,6 @@ For detailed deployment instructions, see: These changes are designed to be safe if metadata extraction or probes fail — clients should still attempt playback using the provided `url` and fall back to requesting/resolving richer metadata when available. See `MQTT_EVENT_PAYLOAD_GUIDE.md` for details. - - `infoscreen/{uuid}/group_id` - Client group assignment ## 🧩 Developer Environment Notes (Dev Container) - Extensions: UI-only `Dev Containers` runs on the host UI; not installed inside the container to avoid reinstallation loops. See `/.devcontainer/devcontainer.json` (`remote.extensionKind`). @@ -381,6 +382,9 @@ mosquitto_sub -h localhost -t "infoscreen/+/heartbeat" -v ### Health & Monitoring - `GET /health` - Service health check - `GET /api/screenshots/{uuid}.jpg` - Client screenshots +- `GET /api/client-logs/monitoring-overview` - Aggregated monitoring overview for dashboard (superadmin) +- `GET /api/client-logs/recent-errors` - Recent error feed across clients (admin+) +- `GET /api/client-logs/{uuid}/logs` - Filtered per-client logs (admin+) ## 🎨 Frontend Features @@ -444,6 +448,10 @@ mosquitto_sub -h localhost -t "infoscreen/+/heartbeat" -v - Real-time event status: shows currently running events with type, title, and time window - Filters out unassigned groups for focused view - Resource-based Syncfusion timeline scheduler with resize and drag-drop support +- **Monitoring**: Superadmin-only monitoring dashboard + - Live client health states (`healthy`, `warning`, `critical`, `offline`) from heartbeat/process/log data + - Latest screenshot preview and process metadata per client + - System-wide recent error stream and per-client log drill-down - **Program info**: Version, build info, tech stack and paginated changelog (reads `dashboard/public/program-info.json`) ## 🔒 Security & Authentication @@ -474,7 +482,7 @@ mosquitto_sub -h localhost -t "infoscreen/+/heartbeat" -v - MQTT: Pub/sub functionality test - Dashboard: Nginx availability - **Scheduler**: Logging is concise; conversion lookups are cached and logged only once per media. -- Dashboard: Nginx availability +- Monitoring API: `/api/client-logs/monitoring-overview` and `/api/client-logs/recent-errors` for live diagnostics ### Logging Strategy - **Development**: Docker Compose logs with service prefixes diff --git a/TECH-CHANGELOG.md b/TECH-CHANGELOG.md index ea13eed..efdce46 100644 --- a/TECH-CHANGELOG.md +++ b/TECH-CHANGELOG.md @@ -75,10 +75,18 @@ Backend technical work (post-release notes; no version bump): - API endpoints (`server/routes/client_logs.py`): - `GET /api/client-logs//logs` – Retrieve client logs with filters (level, limit, since); authenticated (admin_or_higher) - `GET /api/client-logs/summary` – Get log counts by level per client for last 24h; authenticated (admin_or_higher) + - `GET /api/client-logs/monitoring-overview` – Aggregated monitoring overview for dashboard clients/statuses; authenticated (admin_or_higher) - `GET /api/client-logs/recent-errors` – System-wide error monitoring; authenticated (admin_or_higher) - `GET /api/client-logs/test` – Infrastructure validation endpoint (no auth required) - Blueprint registered in `server/wsgi.py` as `client_logs_bp` - Dev environment fix: Updated `docker-compose.override.yml` listener service to use `working_dir: /workspace` and direct command path for live code reload +- 🖥️ **Monitoring Dashboard Integration (2026-03-24)**: + - Frontend monitoring dashboard (`dashboard/src/monitoring.tsx`) is active and wired to monitoring APIs + - Superadmin-only route/menu integration completed in `dashboard/src/App.tsx` + - Added dashboard monitoring API client (`dashboard/src/apiClientMonitoring.ts`) for overview and recent errors +- 🐛 **Presentation Flags Persistence Fix (2026-03-24)**: + - Fixed persistence for presentation flags `page_progress` and `auto_progress` across create/update and detached-occurrence flows + - API serialization now reliably returns stored values for presentation behavior fields - 📡 **MQTT Protocol Extensions**: - New log topics: `infoscreen/{uuid}/logs/{error|warn|info}` with JSON payload (timestamp, message, context) - New health topic: `infoscreen/{uuid}/health` with metrics (expected_state, actual_state, health_metrics) @@ -95,8 +103,7 @@ Backend technical work (post-release notes; no version bump): Notes for integrators: - Tiered logging strategy: ERROR/WARN always centralized (QoS 1), INFO dev-only (QoS 0), DEBUG local-only -- Client-side implementation pending (Phase 3: watchdog service) -- Dashboard UI pending (Phase 4: log viewer and health indicators) +- Monitoring dashboard is implemented and consumes `/api/client-logs/monitoring-overview`, `/api/client-logs/recent-errors`, and `/api/client-logs//logs` - Foreign key constraint prevents logging for non-existent clients (data integrity enforced) - Migration is idempotent and can be safely rerun after interruption - Use `GET /api/client-logs/test` for quick infrastructure validation without authentication diff --git a/dashboard/src/App.tsx b/dashboard/src/App.tsx index 0734a4e..05b5ffe 100644 --- a/dashboard/src/App.tsx +++ b/dashboard/src/App.tsx @@ -1,5 +1,5 @@ import React, { useState } from 'react'; -import { BrowserRouter as Router, Routes, Route, Link, Outlet, useNavigate } from 'react-router-dom'; +import { BrowserRouter as Router, Routes, Route, Link, Outlet, useNavigate, Navigate } from 'react-router-dom'; import { SidebarComponent } from '@syncfusion/ej2-react-navigations'; import { ButtonComponent } from '@syncfusion/ej2-react-buttons'; import { DropDownButtonComponent } from '@syncfusion/ej2-react-splitbuttons'; @@ -19,6 +19,7 @@ import { Settings, Monitor, MonitorDotIcon, + Activity, LogOut, Wrench, Info, @@ -31,6 +32,7 @@ const sidebarItems = [ { name: 'Ressourcen', path: '/ressourcen', icon: Boxes, minRole: 'editor' }, { name: 'Raumgruppen', path: '/infoscr_groups', icon: MonitorDotIcon, minRole: 'admin' }, { name: 'Infoscreen-Clients', path: '/clients', icon: Monitor, minRole: 'admin' }, + { name: 'Monitor-Dashboard', path: '/monitoring', icon: Activity, minRole: 'superadmin' }, { name: 'Erweiterungsmodus', path: '/setup', icon: Wrench, minRole: 'admin' }, { name: 'Medien', path: '/medien', icon: Image, minRole: 'editor' }, { name: 'Benutzer', path: '/benutzer', icon: User, minRole: 'admin' }, @@ -49,6 +51,7 @@ import Benutzer from './users'; import Einstellungen from './settings'; import SetupMode from './SetupMode'; import Programminfo from './programminfo'; +import MonitoringDashboard from './monitoring'; import Logout from './logout'; import Login from './login'; import { useAuth } from './useAuth'; @@ -480,6 +483,14 @@ const App: React.FC = () => { return <>{children}; }; + const RequireSuperadmin: React.FC<{ children: React.ReactNode }> = ({ children }) => { + const { isAuthenticated, loading, user } = useAuth(); + if (loading) return
Lade ...
; + if (!isAuthenticated) return ; + if (user?.role !== 'superadmin') return ; + return <>{children}; + }; + return ( @@ -499,6 +510,14 @@ const App: React.FC = () => { } /> } /> } /> + + + + } + /> } /> } /> diff --git a/dashboard/src/apiClientMonitoring.ts b/dashboard/src/apiClientMonitoring.ts new file mode 100644 index 0000000..5224278 --- /dev/null +++ b/dashboard/src/apiClientMonitoring.ts @@ -0,0 +1,106 @@ +export interface MonitoringLogEntry { + id: number; + timestamp: string | null; + level: 'ERROR' | 'WARN' | 'INFO' | 'DEBUG' | null; + message: string; + context: Record; + client_uuid?: string; +} + +export interface MonitoringClient { + uuid: string; + hostname?: string | null; + description?: string | null; + ip?: string | null; + model?: string | null; + groupId?: number | null; + groupName?: string | null; + registrationTime?: string | null; + lastAlive?: string | null; + isAlive: boolean; + status: 'healthy' | 'warning' | 'critical' | 'offline'; + currentEventId?: number | null; + currentProcess?: string | null; + processStatus?: string | null; + processPid?: number | null; + screenHealthStatus?: string | null; + lastScreenshotAnalyzed?: string | null; + lastScreenshotHash?: string | null; + screenshotUrl: string; + logCounts24h: { + error: number; + warn: number; + info: number; + debug: number; + }; + latestLog?: MonitoringLogEntry | null; + latestError?: MonitoringLogEntry | null; +} + +export interface MonitoringOverview { + summary: { + totalClients: number; + onlineClients: number; + offlineClients: number; + healthyClients: number; + warningClients: number; + criticalClients: number; + errorLogs: number; + warnLogs: number; + }; + periodHours: number; + gracePeriodSeconds: number; + since: string; + timestamp: string; + clients: MonitoringClient[]; +} + +export interface ClientLogsResponse { + client_uuid: string; + logs: MonitoringLogEntry[]; + count: number; + limit: number; +} + +async function parseJsonResponse(response: Response, fallbackMessage: string): Promise { + const data = await response.json(); + if (!response.ok) { + throw new Error(data.error || fallbackMessage); + } + return data as T; +} + +export async function fetchMonitoringOverview(hours = 24): Promise { + const response = await fetch(`/api/client-logs/monitoring-overview?hours=${hours}`, { + credentials: 'include', + }); + return parseJsonResponse(response, 'Fehler beim Laden der Monitoring-Übersicht'); +} + +export async function fetchRecentClientErrors(limit = 20): Promise { + const response = await fetch(`/api/client-logs/recent-errors?limit=${limit}`, { + credentials: 'include', + }); + const data = await parseJsonResponse<{ errors: MonitoringLogEntry[] }>( + response, + 'Fehler beim Laden der letzten Fehler' + ); + return data.errors; +} + +export async function fetchClientMonitoringLogs( + uuid: string, + options: { level?: string; limit?: number } = {} +): Promise { + const params = new URLSearchParams(); + if (options.level && options.level !== 'ALL') { + params.set('level', options.level); + } + params.set('limit', String(options.limit ?? 100)); + + const response = await fetch(`/api/client-logs/${uuid}/logs?${params.toString()}`, { + credentials: 'include', + }); + const data = await parseJsonResponse(response, 'Fehler beim Laden der Client-Logs'); + return data.logs; +} \ No newline at end of file diff --git a/dashboard/src/monitoring.css b/dashboard/src/monitoring.css new file mode 100644 index 0000000..47746f2 --- /dev/null +++ b/dashboard/src/monitoring.css @@ -0,0 +1,347 @@ +.monitoring-page { + display: flex; + flex-direction: column; + gap: 1.25rem; + padding: 0.5rem 0.25rem 1rem; +} + +.monitoring-header-row { + display: flex; + justify-content: space-between; + align-items: flex-start; + gap: 1rem; + flex-wrap: wrap; +} + +.monitoring-title { + margin: 0; + font-size: 1.75rem; + font-weight: 700; + color: #5c4318; +} + +.monitoring-subtitle { + margin: 0.35rem 0 0; + color: #6b7280; + max-width: 60ch; +} + +.monitoring-toolbar { + display: flex; + align-items: end; + gap: 0.75rem; + flex-wrap: wrap; +} + +.monitoring-toolbar-field { + display: flex; + flex-direction: column; + gap: 0.35rem; + min-width: 190px; +} + +.monitoring-toolbar-field-compact { + min-width: 160px; +} + +.monitoring-toolbar-field label { + font-size: 0.875rem; + font-weight: 600; + color: #5b4b32; +} + +.monitoring-meta-row { + display: flex; + gap: 1rem; + flex-wrap: wrap; + color: #6b7280; + font-size: 0.92rem; +} + +.monitoring-summary-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); + gap: 1rem; +} + +.monitoring-metric-card { + overflow: hidden; +} + +.monitoring-metric-content { + display: flex; + flex-direction: column; + gap: 0.35rem; +} + +.monitoring-metric-title { + font-size: 0.9rem; + font-weight: 600; + color: #6b7280; +} + +.monitoring-metric-value { + font-size: 2rem; + font-weight: 700; + color: #1f2937; + line-height: 1; +} + +.monitoring-metric-subtitle { + font-size: 0.85rem; + color: #64748b; +} + +.monitoring-main-grid { + display: grid; + grid-template-columns: minmax(0, 2fr) minmax(320px, 1fr); + gap: 1rem; + align-items: start; +} + +.monitoring-sidebar-column { + display: flex; + flex-direction: column; + gap: 1rem; +} + +.monitoring-panel { + background: #fff; + border: 1px solid #e5e7eb; + border-radius: 16px; + padding: 1.1rem; + box-shadow: 0 12px 40px rgb(120 89 28 / 8%); +} + +.monitoring-clients-panel { + min-width: 0; +} + +.monitoring-panel-header { + display: flex; + justify-content: space-between; + align-items: center; + gap: 0.75rem; + margin-bottom: 0.85rem; +} + +.monitoring-panel-header-stacked { + align-items: end; + flex-wrap: wrap; +} + +.monitoring-panel-header h3 { + margin: 0; + font-size: 1.1rem; + font-weight: 700; +} + +.monitoring-panel-header span { + color: #6b7280; + font-size: 0.9rem; +} + +.monitoring-detail-card .e-card-content { + padding-top: 0; +} + +.monitoring-detail-list { + display: flex; + flex-direction: column; + gap: 0.75rem; +} + +.monitoring-detail-row { + display: flex; + justify-content: space-between; + gap: 1rem; + align-items: flex-start; + border-bottom: 1px solid #f1f5f9; + padding-bottom: 0.55rem; +} + +.monitoring-detail-row span { + color: #64748b; + font-size: 0.9rem; +} + +.monitoring-detail-row strong { + text-align: right; + color: #111827; +} + +.monitoring-status-badge { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 0.22rem 0.6rem; + border-radius: 999px; + font-weight: 700; + font-size: 0.78rem; + letter-spacing: 0.01em; +} + +.monitoring-screenshot { + width: 100%; + border-radius: 12px; + border: 1px solid #e5e7eb; + background: linear-gradient(135deg, #f8fafc, #e2e8f0); + min-height: 180px; + object-fit: cover; +} + +.monitoring-screenshot-meta { + margin-top: 0.55rem; + font-size: 0.88rem; + color: #64748b; +} + +.monitoring-error-box { + display: flex; + flex-direction: column; + gap: 0.5rem; + padding: 0.85rem; + border-radius: 12px; + background: linear-gradient(135deg, #fff1f2, #fee2e2); + border: 1px solid #fecdd3; +} + +.monitoring-error-time { + color: #9f1239; + font-size: 0.85rem; + font-weight: 600; +} + +.monitoring-error-message { + color: #4c0519; + font-weight: 600; +} + +.monitoring-mono { + font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; + font-size: 0.85rem; +} + +.monitoring-log-detail-row { + display: flex; + justify-content: space-between; + gap: 1rem; + align-items: flex-start; + border-bottom: 1px solid #f1f5f9; + padding-bottom: 0.55rem; +} + +.monitoring-log-detail-row span { + color: #64748b; + font-size: 0.9rem; +} + +.monitoring-log-detail-row strong { + text-align: right; + color: #111827; +} + +.monitoring-log-context { + margin: 0; + background: #f8fafc; + border: 1px solid #e2e8f0; + border-radius: 10px; + padding: 0.75rem; + white-space: pre-wrap; + overflow-wrap: anywhere; + max-height: 280px; + overflow: auto; + font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; + font-size: 0.84rem; + color: #0f172a; +} + +.monitoring-log-dialog-content { + display: flex; + flex-direction: column; + gap: 1rem; + padding: 0.9rem 1rem 0.55rem; +} + +.monitoring-log-dialog-body { + min-height: 340px; + display: flex; + flex-direction: column; + justify-content: space-between; +} + +.monitoring-log-dialog-actions { + margin-top: 0.5rem; + padding: 0 1rem 0.9rem; + display: flex; + justify-content: flex-end; +} + +.monitoring-log-context-title { + font-weight: 600; + margin-bottom: 0.55rem; +} + +.monitoring-log-dialog-content .monitoring-log-detail-row { + padding: 0.1rem 0 0.75rem; +} + +.monitoring-log-dialog-content .monitoring-log-context { + padding: 0.95rem; + border-radius: 12px; +} + +.monitoring-lower-grid { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 1rem; +} + +@media (width <= 1200px) { + .monitoring-main-grid, + .monitoring-lower-grid { + grid-template-columns: 1fr; + } +} + +@media (width <= 720px) { + .monitoring-page { + padding: 0.25rem 0 0.75rem; + } + + .monitoring-title { + font-size: 1.5rem; + } + + .monitoring-header-row, + .monitoring-panel-header, + .monitoring-detail-row, + .monitoring-log-detail-row { + flex-direction: column; + align-items: flex-start; + } + + .monitoring-detail-row strong, + .monitoring-log-detail-row strong { + text-align: left; + } + + .monitoring-toolbar, + .monitoring-toolbar-field, + .monitoring-toolbar-field-compact { + width: 100%; + } + + .monitoring-log-dialog-content { + padding: 0.4rem 0.2rem 0.1rem; + gap: 0.75rem; + } + + .monitoring-log-dialog-body { + min-height: 300px; + } + + .monitoring-log-dialog-actions { + padding: 0 0.2rem 0.4rem; + } +} \ No newline at end of file diff --git a/dashboard/src/monitoring.tsx b/dashboard/src/monitoring.tsx new file mode 100644 index 0000000..3c81ad5 --- /dev/null +++ b/dashboard/src/monitoring.tsx @@ -0,0 +1,534 @@ +import React from 'react'; +import { + fetchClientMonitoringLogs, + fetchMonitoringOverview, + fetchRecentClientErrors, + type MonitoringClient, + type MonitoringLogEntry, + type MonitoringOverview, +} from './apiClientMonitoring'; +import { useAuth } from './useAuth'; +import { ButtonComponent } from '@syncfusion/ej2-react-buttons'; +import { DropDownListComponent } from '@syncfusion/ej2-react-dropdowns'; +import { + GridComponent, + ColumnsDirective, + ColumnDirective, + Inject, + Page, + Search, + Sort, + Toolbar, +} from '@syncfusion/ej2-react-grids'; +import { MessageComponent } from '@syncfusion/ej2-react-notifications'; +import { DialogComponent } from '@syncfusion/ej2-react-popups'; +import './monitoring.css'; + +const REFRESH_INTERVAL_MS = 15000; + +const hourOptions = [ + { text: 'Letzte 6 Stunden', value: 6 }, + { text: 'Letzte 24 Stunden', value: 24 }, + { text: 'Letzte 72 Stunden', value: 72 }, + { text: 'Letzte 168 Stunden', value: 168 }, +]; + +const logLevelOptions = [ + { text: 'Alle Logs', value: 'ALL' }, + { text: 'ERROR', value: 'ERROR' }, + { text: 'WARN', value: 'WARN' }, + { text: 'INFO', value: 'INFO' }, + { text: 'DEBUG', value: 'DEBUG' }, +]; + +const statusPalette: Record = { + healthy: { label: 'Stabil', color: '#166534', background: '#dcfce7' }, + warning: { label: 'Warnung', color: '#92400e', background: '#fef3c7' }, + critical: { label: 'Kritisch', color: '#991b1b', background: '#fee2e2' }, + offline: { label: 'Offline', color: '#334155', background: '#e2e8f0' }, +}; + +function parseUtcDate(value?: string | null): Date | null { + if (!value) return null; + const trimmed = value.trim(); + if (!trimmed) return null; + + const hasTimezone = /[zZ]$|[+-]\d{2}:?\d{2}$/.test(trimmed); + const utcValue = hasTimezone ? trimmed : `${trimmed}Z`; + const parsed = new Date(utcValue); + if (Number.isNaN(parsed.getTime())) return null; + return parsed; +} + +function formatTimestamp(value?: string | null): string { + if (!value) return 'Keine Daten'; + const date = parseUtcDate(value); + if (!date) return value; + return date.toLocaleString('de-DE'); +} + +function formatRelative(value?: string | null): string { + if (!value) return 'Keine Daten'; + const date = parseUtcDate(value); + if (!date) return 'Unbekannt'; + + const diffMs = Date.now() - date.getTime(); + const diffMinutes = Math.floor(diffMs / 60000); + const diffHours = Math.floor(diffMinutes / 60); + const diffDays = Math.floor(diffHours / 24); + + if (diffMinutes < 1) return 'gerade eben'; + if (diffMinutes < 60) return `vor ${diffMinutes} Min.`; + if (diffHours < 24) return `vor ${diffHours} Std.`; + return `vor ${diffDays} Tag${diffDays === 1 ? '' : 'en'}`; +} + +function statusBadge(status: string) { + const palette = statusPalette[status] || statusPalette.offline; + return ( + + {palette.label} + + ); +} + +function renderMetricCard(title: string, value: number, subtitle: string, accent: string) { + return ( +
+
+
{title}
+
{value}
+
{subtitle}
+
+
+ ); +} + +function renderContext(context?: Record): string { + if (!context || Object.keys(context).length === 0) { + return 'Kein Kontext vorhanden'; + } + try { + return JSON.stringify(context, null, 2); + } catch { + return 'Kontext konnte nicht formatiert werden'; + } +} + +function buildScreenshotUrl(client: MonitoringClient, overviewTimestamp?: string | null): string { + const refreshKey = client.lastScreenshotHash || client.lastScreenshotAnalyzed || overviewTimestamp; + if (!refreshKey) { + return client.screenshotUrl; + } + + const separator = client.screenshotUrl.includes('?') ? '&' : '?'; + return `${client.screenshotUrl}${separator}v=${encodeURIComponent(refreshKey)}`; +} + +const MonitoringDashboard: React.FC = () => { + const { user } = useAuth(); + const [hours, setHours] = React.useState(24); + const [logLevel, setLogLevel] = React.useState('ALL'); + const [overview, setOverview] = React.useState(null); + const [recentErrors, setRecentErrors] = React.useState([]); + const [clientLogs, setClientLogs] = React.useState([]); + const [selectedClientUuid, setSelectedClientUuid] = React.useState(null); + const [loading, setLoading] = React.useState(true); + const [error, setError] = React.useState(null); + const [logsLoading, setLogsLoading] = React.useState(false); + const [screenshotErrored, setScreenshotErrored] = React.useState(false); + const selectedClientUuidRef = React.useRef(null); + const [selectedLogEntry, setSelectedLogEntry] = React.useState(null); + + const selectedClient = React.useMemo(() => { + if (!overview || !selectedClientUuid) return null; + return overview.clients.find(client => client.uuid === selectedClientUuid) || null; + }, [overview, selectedClientUuid]); + + const selectedClientScreenshotUrl = React.useMemo(() => { + if (!selectedClient) return null; + return buildScreenshotUrl(selectedClient, overview?.timestamp || null); + }, [selectedClient, overview?.timestamp]); + + React.useEffect(() => { + selectedClientUuidRef.current = selectedClientUuid; + }, [selectedClientUuid]); + + const loadOverview = React.useCallback(async (requestedHours: number, preserveSelection = true) => { + setLoading(true); + setError(null); + try { + const [overviewData, errorsData] = await Promise.all([ + fetchMonitoringOverview(requestedHours), + fetchRecentClientErrors(25), + ]); + setOverview(overviewData); + setRecentErrors(errorsData); + + const currentSelection = selectedClientUuidRef.current; + const nextSelectedUuid = + preserveSelection && currentSelection && overviewData.clients.some(client => client.uuid === currentSelection) + ? currentSelection + : overviewData.clients[0]?.uuid || null; + + setSelectedClientUuid(nextSelectedUuid); + setScreenshotErrored(false); + } catch (loadError) { + setError(loadError instanceof Error ? loadError.message : 'Monitoring-Daten konnten nicht geladen werden'); + } finally { + setLoading(false); + } + }, []); + + React.useEffect(() => { + loadOverview(hours, false); + }, [hours, loadOverview]); + + React.useEffect(() => { + const intervalId = window.setInterval(() => { + loadOverview(hours); + }, REFRESH_INTERVAL_MS); + + return () => window.clearInterval(intervalId); + }, [hours, loadOverview]); + + React.useEffect(() => { + if (!selectedClientUuid) { + setClientLogs([]); + return; + } + + let active = true; + const loadLogs = async () => { + setLogsLoading(true); + try { + const logs = await fetchClientMonitoringLogs(selectedClientUuid, { level: logLevel, limit: 100 }); + if (active) { + setClientLogs(logs); + } + } catch (loadError) { + if (active) { + setClientLogs([]); + setError(loadError instanceof Error ? loadError.message : 'Client-Logs konnten nicht geladen werden'); + } + } finally { + if (active) { + setLogsLoading(false); + } + } + }; + + loadLogs(); + return () => { + active = false; + }; + }, [selectedClientUuid, logLevel]); + + React.useEffect(() => { + setScreenshotErrored(false); + }, [selectedClientUuid]); + + if (!user || user.role !== 'superadmin') { + return ( + + ); + } + + const clientGridData = (overview?.clients || []).map(client => ({ + ...client, + displayName: client.description || client.hostname || client.uuid, + lastAliveDisplay: formatTimestamp(client.lastAlive), + currentProcessDisplay: client.currentProcess || 'kein Prozess', + processStatusDisplay: client.processStatus || 'unbekannt', + errorCount: client.logCounts24h.error, + warnCount: client.logCounts24h.warn, + })); + + return ( +
+
+
+

Monitor-Dashboard

+

+ Live-Zustand der Infoscreen-Clients, Prozessstatus und zentrale Fehlerprotokolle. +

+
+
+
+ + setHours(Number(args.value))} + /> +
+ loadOverview(hours)} disabled={loading}> + Aktualisieren + +
+
+ + {error && } + + {overview && ( +
+ Stand: {formatTimestamp(overview.timestamp)} + Alive-Fenster: {overview.gracePeriodSeconds} Sekunden + Betrachtungszeitraum: {overview.periodHours} Stunden +
+ )} + +
+ {renderMetricCard('Clients gesamt', overview?.summary.totalClients || 0, 'Registrierte Displays', '#7c3aed')} + {renderMetricCard('Online', overview?.summary.onlineClients || 0, 'Heartbeat innerhalb der Grace-Periode', '#15803d')} + {renderMetricCard('Warnungen', overview?.summary.warningClients || 0, 'Warn-Logs oder Übergangszustände', '#d97706')} + {renderMetricCard('Kritisch', overview?.summary.criticalClients || 0, 'Crashs oder Fehler-Logs', '#dc2626')} + {renderMetricCard('Offline', overview?.summary.offlineClients || 0, 'Keine frischen Signale', '#475569')} + {renderMetricCard('Fehler-Logs', overview?.summary.errorLogs || 0, 'Im gewählten Zeitraum', '#b91c1c')} +
+ + {loading && !overview ? ( + + ) : ( +
+
+
+

Client-Zustand

+ {overview?.clients.length || 0} Einträge +
+ { + setSelectedClientUuid(args.data.uuid); + }} + > + + statusBadge(props.status)} + /> + + + + + + + + + + +
+ +
+
+
+
+
Aktiver Client
+
+
+
+ {selectedClient ? ( +
+
+ Name + {selectedClient.description || selectedClient.hostname || selectedClient.uuid} +
+
+ Status + {statusBadge(selectedClient.status)} +
+
+ UUID + {selectedClient.uuid} +
+
+ Raumgruppe + {selectedClient.groupName || 'Nicht zugeordnet'} +
+
+ Prozess + {selectedClient.currentProcess || 'kein Prozess'} +
+
+ PID + {selectedClient.processPid || 'keine PID'} +
+
+ Event-ID + {selectedClient.currentEventId || 'keine Zuordnung'} +
+
+ Letztes Signal + {formatRelative(selectedClient.lastAlive)} +
+
+ Bildschirmstatus + {selectedClient.screenHealthStatus || 'UNKNOWN'} +
+
+ Letzte Analyse + {formatTimestamp(selectedClient.lastScreenshotAnalyzed)} +
+
+ ) : ( + + )} +
+
+ +
+
+
+
Der letzte Screenshot
+
+
+
+ {selectedClient ? ( + <> + {screenshotErrored ? ( + + ) : ( + {`Screenshot setScreenshotErrored(true)} + /> + )} +
+ Empfangen: {formatTimestamp(selectedClient.lastScreenshotAnalyzed)} +
+ + ) : ( + + )} +
+
+ +
+
+
+
Letzter Fehler
+
+
+
+ {selectedClient?.latestError ? ( +
+
{formatTimestamp(selectedClient.latestError.timestamp)}
+
{selectedClient.latestError.message}
+
+ ) : ( + + )} +
+
+
+
+ )} + +
+
+
+
+

Client-Logs

+ {selectedClient ? `Client ${selectedClient.uuid}` : 'Kein Client ausgewählt'} +
+
+ + setLogLevel(String(args.value))} + /> +
+
+ {logsLoading && } + { + setSelectedLogEntry(args.data); + }} + > + + formatTimestamp(props.timestamp)} /> + + + + + +
+ +
+
+

Letzte Fehler systemweit

+ {recentErrors.length} Einträge +
+ + + formatTimestamp(props.timestamp)} /> + + + + + +
+
+ + setSelectedLogEntry(null)} + > + {selectedLogEntry && ( +
+
+
+ Zeit + {formatTimestamp(selectedLogEntry.timestamp)} +
+
+ Level + {selectedLogEntry.level || 'Unbekannt'} +
+
+ Nachricht + {selectedLogEntry.message} +
+
+
Kontext
+
{renderContext(selectedLogEntry.context)}
+
+
+
+ setSelectedLogEntry(null)}>Schließen +
+
+ )} +
+
+ ); +}; + +export default MonitoringDashboard; \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 851401a..0c1b6f6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,8 +19,8 @@ services: - DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME} - DB_URL=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME} - API_BASE_URL=http://server:8000 - - ENV=${ENV:-development} - - FLASK_SECRET_KEY=${FLASK_SECRET_KEY:-dev-secret-key-change-in-production} + - ENV=${ENV:-development} + - FLASK_SECRET_KEY=${FLASK_SECRET_KEY:-dev-secret-key-change-in-production} - DEFAULT_SUPERADMIN_USERNAME=${DEFAULT_SUPERADMIN_USERNAME:-superadmin} - DEFAULT_SUPERADMIN_PASSWORD=${DEFAULT_SUPERADMIN_PASSWORD} # 🔧 ENTFERNT: Volume-Mount ist nur für die Entwicklung diff --git a/listener/listener.py b/listener/listener.py index b76eb9c..7d5fc00 100644 --- a/listener/listener.py +++ b/listener/listener.py @@ -3,15 +3,17 @@ import json import logging import datetime import base64 +import re import requests import paho.mqtt.client as mqtt from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker -from models.models import Client, ClientLog, LogLevel, ProcessStatus +from models.models import Client, ClientLog, LogLevel, ProcessStatus, ScreenHealthStatus logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s') -# Load .env in development -if os.getenv("ENV", "development") == "development": +# Load .env only when not already configured by Docker (API_BASE_URL not set by compose means we're outside a container) +_api_already_set = bool(os.environ.get("API_BASE_URL")) +if not _api_already_set and os.getenv("ENV", "development") == "development": try: from dotenv import load_dotenv load_dotenv(".env") @@ -31,6 +33,161 @@ Session = sessionmaker(bind=engine) API_BASE_URL = os.getenv("API_BASE_URL", "http://server:8000") +def normalize_process_status(value): + if value is None: + return None + if isinstance(value, ProcessStatus): + return value + + normalized = str(value).strip().lower() + if not normalized: + return None + + try: + return ProcessStatus(normalized) + except ValueError: + return None + + +def normalize_event_id(value): + if value is None or isinstance(value, bool): + return None + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + + normalized = str(value).strip() + if not normalized: + return None + if normalized.isdigit(): + return int(normalized) + + match = re.search(r"(\d+)$", normalized) + if match: + return int(match.group(1)) + + return None + + +def parse_timestamp(value): + if not value: + return None + if isinstance(value, (int, float)): + try: + ts_value = float(value) + if ts_value > 1e12: + ts_value = ts_value / 1000.0 + return datetime.datetime.fromtimestamp(ts_value, datetime.UTC) + except (TypeError, ValueError, OverflowError): + return None + try: + value_str = str(value).strip() + if value_str.isdigit(): + ts_value = float(value_str) + if ts_value > 1e12: + ts_value = ts_value / 1000.0 + return datetime.datetime.fromtimestamp(ts_value, datetime.UTC) + + parsed = datetime.datetime.fromisoformat(value_str.replace('Z', '+00:00')) + if parsed.tzinfo is None: + return parsed.replace(tzinfo=datetime.UTC) + return parsed.astimezone(datetime.UTC) + except ValueError: + return None + + +def infer_screen_health_status(payload_data): + explicit = payload_data.get('screen_health_status') + if explicit: + try: + return ScreenHealthStatus[str(explicit).strip().upper()] + except KeyError: + pass + + metrics = payload_data.get('health_metrics') or {} + if metrics.get('screen_on') is False: + return ScreenHealthStatus.BLACK + + last_frame_update = parse_timestamp(metrics.get('last_frame_update')) + if last_frame_update: + age_seconds = (datetime.datetime.now(datetime.UTC) - last_frame_update).total_seconds() + if age_seconds > 30: + return ScreenHealthStatus.FROZEN + return ScreenHealthStatus.OK + + return None + + +def apply_monitoring_update(client_obj, *, event_id=None, process_name=None, process_pid=None, + process_status=None, last_seen=None, screen_health_status=None, + last_screenshot_analyzed=None): + if last_seen: + client_obj.last_alive = last_seen + + normalized_event_id = normalize_event_id(event_id) + if normalized_event_id is not None: + client_obj.current_event_id = normalized_event_id + + if process_name is not None: + client_obj.current_process = process_name + + if process_pid is not None: + client_obj.process_pid = process_pid + + normalized_status = normalize_process_status(process_status) + if normalized_status is not None: + client_obj.process_status = normalized_status + + if screen_health_status is not None: + client_obj.screen_health_status = screen_health_status + + if last_screenshot_analyzed is not None: + existing = client_obj.last_screenshot_analyzed + if existing is not None and existing.tzinfo is None: + existing = existing.replace(tzinfo=datetime.UTC) + + candidate = last_screenshot_analyzed + if candidate.tzinfo is None: + candidate = candidate.replace(tzinfo=datetime.UTC) + + if existing is None or candidate >= existing: + client_obj.last_screenshot_analyzed = candidate + + +def _extract_image_and_timestamp(data): + image_value = None + timestamp_value = None + + if not isinstance(data, dict): + return None, None + + screenshot_obj = data.get("screenshot") if isinstance(data.get("screenshot"), dict) else None + metadata_obj = data.get("metadata") if isinstance(data.get("metadata"), dict) else None + screenshot_meta_obj = screenshot_obj.get("metadata") if screenshot_obj and isinstance(screenshot_obj.get("metadata"), dict) else None + + for key in ("image", "data"): + if isinstance(data.get(key), str) and data.get(key): + image_value = data.get(key) + break + if image_value is None and screenshot_obj is not None: + for key in ("image", "data"): + if isinstance(screenshot_obj.get(key), str) and screenshot_obj.get(key): + image_value = screenshot_obj.get(key) + break + + for container in (data, screenshot_obj, metadata_obj, screenshot_meta_obj): + if not isinstance(container, dict): + continue + for key in ("timestamp", "captured_at", "capture_time", "created_at"): + value = container.get(key) + if value is not None: + timestamp_value = value + return image_value, timestamp_value + + return image_value, timestamp_value + + def handle_screenshot(uuid, payload): """ Handle screenshot data received via MQTT and forward to API. @@ -40,13 +197,16 @@ def handle_screenshot(uuid, payload): # Try to parse as JSON first try: data = json.loads(payload.decode()) - if "image" in data: + image_b64, timestamp_value = _extract_image_and_timestamp(data) + if image_b64: # Payload is JSON with base64 image - api_payload = {"image": data["image"]} + api_payload = {"image": image_b64} + if timestamp_value is not None: + api_payload["timestamp"] = timestamp_value headers = {"Content-Type": "application/json"} logging.debug(f"Forwarding base64 screenshot from {uuid} to API") else: - logging.warning(f"Screenshot JSON from {uuid} missing 'image' field") + logging.warning(f"Screenshot JSON from {uuid} missing image/data field") return except (json.JSONDecodeError, UnicodeDecodeError): # Payload is raw binary image data - encode to base64 for API @@ -101,21 +261,28 @@ def on_message(client, userdata, msg): try: payload_text = msg.payload.decode() data = json.loads(payload_text) - shot = data.get("screenshot") - if isinstance(shot, dict): - # Prefer 'data' field (base64) inside screenshot object - image_b64 = shot.get("data") - if image_b64: - logging.debug(f"Dashboard enthält Screenshot für {uuid}; Weiterleitung an API") - # Build a lightweight JSON with image field for API handler - api_payload = json.dumps({"image": image_b64}).encode("utf-8") - handle_screenshot(uuid, api_payload) + image_b64, ts_value = _extract_image_and_timestamp(data) + if image_b64: + logging.debug(f"Dashboard enthält Screenshot für {uuid}; Weiterleitung an API") + dashboard_payload = {"image": image_b64} + if ts_value is not None: + dashboard_payload["timestamp"] = ts_value + api_payload = json.dumps(dashboard_payload).encode("utf-8") + handle_screenshot(uuid, api_payload) # Update last_alive if status present if data.get("status") == "alive": session = Session() client_obj = session.query(Client).filter_by(uuid=uuid).first() if client_obj: - client_obj.last_alive = datetime.datetime.now(datetime.UTC) + process_health = data.get('process_health') or {} + apply_monitoring_update( + client_obj, + last_seen=datetime.datetime.now(datetime.UTC), + event_id=process_health.get('event_id'), + process_name=process_health.get('current_process') or process_health.get('process'), + process_pid=process_health.get('process_pid') or process_health.get('pid'), + process_status=process_health.get('process_status') or process_health.get('status'), + ) session.commit() session.close() except Exception as e: @@ -140,24 +307,14 @@ def on_message(client, userdata, msg): session = Session() client_obj = session.query(Client).filter_by(uuid=uuid).first() if client_obj: - client_obj.last_alive = datetime.datetime.now(datetime.UTC) - - # Update health fields if present in heartbeat - if 'process_status' in payload_data: - try: - client_obj.process_status = ProcessStatus[payload_data['process_status']] - except (KeyError, TypeError): - pass - - if 'current_process' in payload_data: - client_obj.current_process = payload_data.get('current_process') - - if 'process_pid' in payload_data: - client_obj.process_pid = payload_data.get('process_pid') - - if 'current_event_id' in payload_data: - client_obj.current_event_id = payload_data.get('current_event_id') - + apply_monitoring_update( + client_obj, + last_seen=datetime.datetime.now(datetime.UTC), + event_id=payload_data.get('current_event_id'), + process_name=payload_data.get('current_process'), + process_pid=payload_data.get('process_pid'), + process_status=payload_data.get('process_status'), + ) session.commit() logging.info(f"Heartbeat von {uuid} empfangen, last_alive (UTC) aktualisiert.") session.close() @@ -222,23 +379,20 @@ def on_message(client, userdata, msg): if client_obj: # Update expected state expected = payload_data.get('expected_state', {}) - if 'event_id' in expected: - client_obj.current_event_id = expected['event_id'] - + # Update actual state actual = payload_data.get('actual_state', {}) - if 'process' in actual: - client_obj.current_process = actual['process'] - - if 'pid' in actual: - client_obj.process_pid = actual['pid'] - - if 'status' in actual: - try: - client_obj.process_status = ProcessStatus[actual['status']] - except (KeyError, TypeError): - pass - + screen_health_status = infer_screen_health_status(payload_data) + apply_monitoring_update( + client_obj, + last_seen=datetime.datetime.now(datetime.UTC), + event_id=expected.get('event_id'), + process_name=actual.get('process'), + process_pid=actual.get('pid'), + process_status=actual.get('status'), + screen_health_status=screen_health_status, + last_screenshot_analyzed=parse_timestamp((payload_data.get('health_metrics') or {}).get('last_frame_update')), + ) session.commit() logging.debug(f"Health update from {uuid}: {actual.get('process')} ({actual.get('status')})") session.close() diff --git a/server/routes/client_logs.py b/server/routes/client_logs.py index c3df644..d0fe447 100644 --- a/server/routes/client_logs.py +++ b/server/routes/client_logs.py @@ -1,14 +1,95 @@ from flask import Blueprint, jsonify, request from server.database import Session -from server.permissions import admin_or_higher -from models.models import ClientLog, Client, LogLevel +from server.permissions import admin_or_higher, superadmin_only +from models.models import ClientLog, Client, ClientGroup, LogLevel from sqlalchemy import desc, func from datetime import datetime, timedelta, timezone import json +import os +import glob + +from server.serializers import dict_to_camel_case client_logs_bp = Blueprint("client_logs", __name__, url_prefix="/api/client-logs") +def _grace_period_seconds(): + env = os.environ.get("ENV", "production").lower() + if env in ("development", "dev"): + return int(os.environ.get("HEARTBEAT_GRACE_PERIOD_DEV", "180")) + return int(os.environ.get("HEARTBEAT_GRACE_PERIOD_PROD", "170")) + + +def _to_utc(dt): + if dt is None: + return None + if dt.tzinfo is None: + return dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + + +def _is_client_alive(last_alive, is_active): + if not last_alive or not is_active: + return False + return (datetime.now(timezone.utc) - _to_utc(last_alive)) <= timedelta(seconds=_grace_period_seconds()) + + +def _safe_context(raw_context): + if not raw_context: + return {} + try: + return json.loads(raw_context) + except (TypeError, json.JSONDecodeError): + return {"raw": raw_context} + + +def _serialize_log_entry(log, include_client_uuid=False): + if not log: + return None + + entry = { + "id": log.id, + "timestamp": log.timestamp.isoformat() if log.timestamp else None, + "level": log.level.value if log.level else None, + "message": log.message, + "context": _safe_context(log.context), + } + if include_client_uuid: + entry["client_uuid"] = log.client_uuid + return entry + + +def _determine_client_status(is_alive, process_status, screen_health_status, log_counts): + if not is_alive: + return "offline" + if process_status == "crashed" or screen_health_status in ("BLACK", "FROZEN"): + return "critical" + if log_counts.get("ERROR", 0) > 0: + return "critical" + if process_status in ("starting", "stopped") or log_counts.get("WARN", 0) > 0: + return "warning" + return "healthy" + + +def _infer_last_screenshot_ts(client_uuid): + screenshots_dir = os.path.join(os.path.dirname(__file__), "..", "screenshots") + + candidate_files = [] + latest_file = os.path.join(screenshots_dir, f"{client_uuid}.jpg") + if os.path.exists(latest_file): + candidate_files.append(latest_file) + + candidate_files.extend(glob.glob(os.path.join(screenshots_dir, f"{client_uuid}_*.jpg"))) + if not candidate_files: + return None + + try: + newest_path = max(candidate_files, key=os.path.getmtime) + return datetime.fromtimestamp(os.path.getmtime(newest_path), timezone.utc) + except Exception: + return None + + @client_logs_bp.route("/test", methods=["GET"]) def test_client_logs(): """Test endpoint to verify logging infrastructure (no auth required)""" @@ -107,22 +188,7 @@ def get_client_logs(uuid): # Format results result = [] for log in logs: - entry = { - "id": log.id, - "timestamp": log.timestamp.isoformat() if log.timestamp else None, - "level": log.level.value if log.level else None, - "message": log.message, - "context": {} - } - - # Parse context JSON - if log.context: - try: - entry["context"] = json.loads(log.context) - except json.JSONDecodeError: - entry["context"] = {"raw": log.context} - - result.append(entry) + result.append(_serialize_log_entry(log)) session.close() return jsonify({ @@ -212,6 +278,141 @@ def get_logs_summary(): return jsonify({"error": f"Server error: {str(e)}"}), 500 +@client_logs_bp.route("/monitoring-overview", methods=["GET"]) +@superadmin_only +def get_monitoring_overview(): + """Return a dashboard-friendly monitoring overview for all clients.""" + session = Session() + try: + hours = min(int(request.args.get("hours", 24)), 168) + since = datetime.now(timezone.utc) - timedelta(hours=hours) + + clients = ( + session.query(Client, ClientGroup.name.label("group_name")) + .outerjoin(ClientGroup, Client.group_id == ClientGroup.id) + .order_by(ClientGroup.name.asc(), Client.description.asc(), Client.hostname.asc(), Client.uuid.asc()) + .all() + ) + + log_stats = ( + session.query( + ClientLog.client_uuid, + ClientLog.level, + func.count(ClientLog.id).label("count"), + ) + .filter(ClientLog.timestamp >= since) + .group_by(ClientLog.client_uuid, ClientLog.level) + .all() + ) + + counts_by_client = {} + for stat in log_stats: + if stat.client_uuid not in counts_by_client: + counts_by_client[stat.client_uuid] = { + "ERROR": 0, + "WARN": 0, + "INFO": 0, + "DEBUG": 0, + } + counts_by_client[stat.client_uuid][stat.level.value] = stat.count + + clients_payload = [] + summary_counts = { + "total_clients": 0, + "online_clients": 0, + "offline_clients": 0, + "healthy_clients": 0, + "warning_clients": 0, + "critical_clients": 0, + "error_logs": 0, + "warn_logs": 0, + } + + for client, group_name in clients: + log_counts = counts_by_client.get( + client.uuid, + {"ERROR": 0, "WARN": 0, "INFO": 0, "DEBUG": 0}, + ) + is_alive = _is_client_alive(client.last_alive, client.is_active) + process_status = client.process_status.value if client.process_status else None + screen_health_status = client.screen_health_status.value if client.screen_health_status else None + status = _determine_client_status(is_alive, process_status, screen_health_status, log_counts) + + latest_log = ( + session.query(ClientLog) + .filter_by(client_uuid=client.uuid) + .order_by(desc(ClientLog.timestamp)) + .first() + ) + latest_error = ( + session.query(ClientLog) + .filter_by(client_uuid=client.uuid, level=LogLevel.ERROR) + .order_by(desc(ClientLog.timestamp)) + .first() + ) + + screenshot_ts = client.last_screenshot_analyzed or _infer_last_screenshot_ts(client.uuid) + + clients_payload.append({ + "uuid": client.uuid, + "hostname": client.hostname, + "description": client.description, + "ip": client.ip, + "model": client.model, + "group_id": client.group_id, + "group_name": group_name, + "registration_time": client.registration_time.isoformat() if client.registration_time else None, + "last_alive": client.last_alive.isoformat() if client.last_alive else None, + "is_alive": is_alive, + "status": status, + "current_event_id": client.current_event_id, + "current_process": client.current_process, + "process_status": process_status, + "process_pid": client.process_pid, + "screen_health_status": screen_health_status, + "last_screenshot_analyzed": screenshot_ts.isoformat() if screenshot_ts else None, + "last_screenshot_hash": client.last_screenshot_hash, + "screenshot_url": f"/screenshots/{client.uuid}", + "log_counts_24h": { + "error": log_counts["ERROR"], + "warn": log_counts["WARN"], + "info": log_counts["INFO"], + "debug": log_counts["DEBUG"], + }, + "latest_log": _serialize_log_entry(latest_log), + "latest_error": _serialize_log_entry(latest_error), + }) + + summary_counts["total_clients"] += 1 + summary_counts["error_logs"] += log_counts["ERROR"] + summary_counts["warn_logs"] += log_counts["WARN"] + if is_alive: + summary_counts["online_clients"] += 1 + else: + summary_counts["offline_clients"] += 1 + if status == "healthy": + summary_counts["healthy_clients"] += 1 + elif status == "warning": + summary_counts["warning_clients"] += 1 + elif status == "critical": + summary_counts["critical_clients"] += 1 + + payload = { + "summary": summary_counts, + "period_hours": hours, + "grace_period_seconds": _grace_period_seconds(), + "since": since.isoformat(), + "timestamp": datetime.now(timezone.utc).isoformat(), + "clients": clients_payload, + } + session.close() + return jsonify(dict_to_camel_case(payload)) + + except Exception as e: + session.close() + return jsonify({"error": f"Server error: {str(e)}"}), 500 + + @client_logs_bp.route("/recent-errors", methods=["GET"]) @admin_or_higher def get_recent_errors(): @@ -235,14 +436,7 @@ def get_recent_errors(): result = [] for log in logs: - entry = { - "id": log.id, - "client_uuid": log.client_uuid, - "timestamp": log.timestamp.isoformat() if log.timestamp else None, - "message": log.message, - "context": json.loads(log.context) if log.context else {} - } - result.append(entry) + result.append(_serialize_log_entry(log, include_client_uuid=True)) session.close() return jsonify({ diff --git a/server/routes/clients.py b/server/routes/clients.py index a6406ff..c1edf00 100644 --- a/server/routes/clients.py +++ b/server/routes/clients.py @@ -4,6 +4,7 @@ from flask import Blueprint, request, jsonify from server.permissions import admin_or_higher from server.mqtt_helper import publish_client_group, delete_client_group_message, publish_multiple_client_groups import sys +from datetime import datetime, timezone sys.path.append('/workspace') clients_bp = Blueprint("clients", __name__, url_prefix="/api/clients") @@ -284,21 +285,46 @@ def upload_screenshot(uuid): import os import base64 import glob - from datetime import datetime - session = Session() client = session.query(Client).filter_by(uuid=uuid).first() if not client: session.close() return jsonify({"error": "Client nicht gefunden"}), 404 - session.close() try: + screenshot_timestamp = None + # Handle JSON payload with base64-encoded image if request.is_json: data = request.get_json() if "image" not in data: return jsonify({"error": "Missing 'image' field in JSON payload"}), 400 + + raw_timestamp = data.get("timestamp") + if raw_timestamp is not None: + try: + if isinstance(raw_timestamp, (int, float)): + ts_value = float(raw_timestamp) + if ts_value > 1e12: + ts_value = ts_value / 1000.0 + screenshot_timestamp = datetime.fromtimestamp(ts_value, timezone.utc) + elif isinstance(raw_timestamp, str): + ts = raw_timestamp.strip() + if ts: + if ts.isdigit(): + ts_value = float(ts) + if ts_value > 1e12: + ts_value = ts_value / 1000.0 + screenshot_timestamp = datetime.fromtimestamp(ts_value, timezone.utc) + else: + ts_normalized = ts.replace("Z", "+00:00") if ts.endswith("Z") else ts + screenshot_timestamp = datetime.fromisoformat(ts_normalized) + if screenshot_timestamp.tzinfo is None: + screenshot_timestamp = screenshot_timestamp.replace(tzinfo=timezone.utc) + else: + screenshot_timestamp = screenshot_timestamp.astimezone(timezone.utc) + except Exception: + screenshot_timestamp = None # Decode base64 image image_data = base64.b64decode(data["image"]) @@ -314,7 +340,8 @@ def upload_screenshot(uuid): os.makedirs(screenshots_dir, exist_ok=True) # Store screenshot with timestamp to track latest - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + now_utc = screenshot_timestamp or datetime.now(timezone.utc) + timestamp = now_utc.strftime("%Y%m%d_%H%M%S") filename = f"{uuid}_{timestamp}.jpg" filepath = os.path.join(screenshots_dir, filename) @@ -326,6 +353,10 @@ def upload_screenshot(uuid): with open(latest_filepath, "wb") as f: f.write(image_data) + # Update screenshot receive timestamp for monitoring dashboard + client.last_screenshot_analyzed = now_utc + session.commit() + # Cleanup: keep only last 20 timestamped screenshots per client pattern = os.path.join(screenshots_dir, f"{uuid}_*.jpg") existing_screenshots = sorted(glob.glob(pattern)) @@ -349,7 +380,10 @@ def upload_screenshot(uuid): }), 200 except Exception as e: + session.rollback() return jsonify({"error": f"Failed to process screenshot: {str(e)}"}), 500 + finally: + session.close() @clients_bp.route("/", methods=["DELETE"]) diff --git a/server/routes/events.py b/server/routes/events.py index 2b61d69..00fdf83 100644 --- a/server/routes/events.py +++ b/server/routes/events.py @@ -104,6 +104,9 @@ def get_events(): "end_time": e.end.isoformat() if e.end else None, "is_all_day": False, "media_id": e.event_media_id, + "slideshow_interval": e.slideshow_interval, + "page_progress": e.page_progress, + "auto_progress": e.auto_progress, "type": e.event_type.value if e.event_type else None, "icon": get_icon_for_type(e.event_type.value if e.event_type else None), # Recurrence metadata @@ -267,6 +270,8 @@ def detach_event_occurrence(event_id, occurrence_date): 'event_type': master.event_type, 'event_media_id': master.event_media_id, 'slideshow_interval': getattr(master, 'slideshow_interval', None), + 'page_progress': getattr(master, 'page_progress', None), + 'auto_progress': getattr(master, 'auto_progress', None), 'created_by': master.created_by, } @@ -318,6 +323,8 @@ def detach_event_occurrence(event_id, occurrence_date): event_type=master_data['event_type'], event_media_id=master_data['event_media_id'], slideshow_interval=master_data['slideshow_interval'], + page_progress=data.get("page_progress", master_data['page_progress']), + auto_progress=data.get("auto_progress", master_data['auto_progress']), recurrence_rule=None, recurrence_end=None, skip_holidays=False, @@ -361,11 +368,15 @@ def create_event(): event_type = data["event_type"] event_media_id = None slideshow_interval = None + page_progress = None + auto_progress = None # Präsentation: event_media_id und slideshow_interval übernehmen if event_type == "presentation": event_media_id = data.get("event_media_id") slideshow_interval = data.get("slideshow_interval") + page_progress = data.get("page_progress") + auto_progress = data.get("auto_progress") if not event_media_id: return jsonify({"error": "event_media_id required for presentation"}), 400 @@ -443,6 +454,8 @@ def create_event(): is_active=True, event_media_id=event_media_id, slideshow_interval=slideshow_interval, + page_progress=page_progress, + auto_progress=auto_progress, autoplay=autoplay, loop=loop, volume=volume, @@ -519,6 +532,10 @@ def update_event(event_id): event.event_type = data.get("event_type", event.event_type) event.event_media_id = data.get("event_media_id", event.event_media_id) event.slideshow_interval = data.get("slideshow_interval", event.slideshow_interval) + if "page_progress" in data: + event.page_progress = data.get("page_progress") + if "auto_progress" in data: + event.auto_progress = data.get("auto_progress") # Video-specific fields if "autoplay" in data: event.autoplay = data.get("autoplay") diff --git a/server/wsgi.py b/server/wsgi.py index 53197c8..6ef4bb7 100644 --- a/server/wsgi.py +++ b/server/wsgi.py @@ -69,12 +69,20 @@ def index(): @app.route("/screenshots/") +@app.route("/screenshots/.jpg") def get_screenshot(uuid): - pattern = os.path.join("screenshots", f"{uuid}*.jpg") + normalized_uuid = uuid[:-4] if uuid.lower().endswith('.jpg') else uuid + latest_filename = f"{normalized_uuid}.jpg" + latest_path = os.path.join("screenshots", latest_filename) + if os.path.exists(latest_path): + return send_from_directory("screenshots", latest_filename) + + pattern = os.path.join("screenshots", f"{normalized_uuid}_*.jpg") files = glob.glob(pattern) if not files: # Dummy-Bild als Redirect oder direkt als Response return jsonify({"error": "Screenshot not found", "dummy": "https://placehold.co/400x300?text=No+Screenshot"}), 404 + files.sort(reverse=True) filename = os.path.basename(files[0]) return send_from_directory("screenshots", filename)