Files
infoscreen/server/routes/client_logs.py
Olaf 24cdf07279 feat(monitoring): add priority screenshot pipeline with screenshot_type + docs cleanup
Implement end-to-end support for typed screenshots and priority rendering in monitoring.

Added
- Accept and forward screenshot_type from MQTT screenshot/dashboard payloads
  (periodic, event_start, event_stop)
- Extend screenshot upload handling to persist typed screenshots and metadata
- Add dedicated priority screenshot serving endpoint with fallback behavior
- Extend monitoring overview with priority screenshot fields and summary count
- Add configurable PRIORITY_SCREENSHOT_TTL_SECONDS window for active priority state

Fixed
- Ensure screenshot cache-busting updates reliably via screenshot hash updates
- Preserve normal periodic screenshot flow while introducing event_start/event_stop priority path

Improved
- Monitoring dashboard now displays screenshot type badges
- Adaptive polling: faster refresh while priority screenshots are active
- Priority screenshot presentation is surfaced immediately to operators

Docs
- Update README and copilot-instructions to match new screenshot_type behavior,
  priority endpoint, TTL config, monitoring fields, and retention model
- Remove redundant/duplicate documentation blocks and improve troubleshooting section clarity
2026-03-29 13:13:13 +00:00

492 lines
17 KiB
Python

from flask import Blueprint, jsonify, request
from server.database import Session
from server.permissions import admin_or_higher, superadmin_only
from models.models import ClientLog, Client, ClientGroup, LogLevel
from sqlalchemy import desc, func
from datetime import datetime, timedelta, timezone
import json
import os
import glob
from server.serializers import dict_to_camel_case
# Blueprint that groups the client log / monitoring endpoints defined in this
# module; every route below is mounted under the /api/client-logs prefix.
client_logs_bp = Blueprint("client_logs", __name__, url_prefix="/api/client-logs")
# Number of seconds after its "received at" timestamp during which a priority
# screenshot is still considered active (see _is_priority_screenshot_active).
# Read once at import time from the environment; defaults to 120.
PRIORITY_SCREENSHOT_TTL_SECONDS = int(os.environ.get("PRIORITY_SCREENSHOT_TTL_SECONDS", "120"))
def _grace_period_seconds():
    """Return the heartbeat grace period, in seconds, for the current environment.

    The ENV variable (default "production", compared case-insensitively)
    selects which setting is read: HEARTBEAT_GRACE_PERIOD_DEV (default 180)
    for "development"/"dev", otherwise HEARTBEAT_GRACE_PERIOD_PROD
    (default 170).
    """
    running_in_dev = os.environ.get("ENV", "production").lower() in ("development", "dev")
    if running_in_dev:
        variable, fallback = "HEARTBEAT_GRACE_PERIOD_DEV", "180"
    else:
        variable, fallback = "HEARTBEAT_GRACE_PERIOD_PROD", "170"
    return int(os.environ.get(variable, fallback))
def _to_utc(dt):
    """Return *dt* as a timezone-aware UTC datetime, or None when *dt* is None.

    A naive datetime is tagged as UTC without shifting its wall-clock value;
    an aware datetime is converted to UTC.
    """
    if dt is None:
        return None
    is_naive = dt.tzinfo is None
    return dt.replace(tzinfo=timezone.utc) if is_naive else dt.astimezone(timezone.utc)
def _is_client_alive(last_alive, is_active):
    """Tell whether a client counts as online.

    A client is alive only when it is active, has a last-alive timestamp, and
    that timestamp (normalised to UTC) is no older than the configured
    heartbeat grace period.
    """
    if last_alive and is_active:
        silence = datetime.now(timezone.utc) - _to_utc(last_alive)
        return silence <= timedelta(seconds=_grace_period_seconds())
    return False
def _safe_context(raw_context):
    """Decode a stored log context without ever raising on bad data.

    Returns {} for an empty/missing context, the decoded JSON value when
    parsing succeeds, and {"raw": raw_context} when the value is not valid
    JSON (or not a type json.loads accepts).
    """
    if raw_context:
        try:
            return json.loads(raw_context)
        except (TypeError, json.JSONDecodeError):
            # Keep the original value visible instead of dropping it.
            return {"raw": raw_context}
    return {}
def _serialize_log_entry(log, include_client_uuid=False):
    """Convert a log record into a JSON-serialisable dict.

    Returns None for a missing (falsy) record. The timestamp is rendered with
    isoformat() and the level through its ``.value``; either becomes None when
    unset. The context is decoded with _safe_context. The owning client's
    uuid is added only when *include_client_uuid* is true.
    """
    if not log:
        return None
    stamp = log.timestamp
    level = log.level
    serialized = dict(
        id=log.id,
        timestamp=stamp.isoformat() if stamp else None,
        level=level.value if level else None,
        message=log.message,
        context=_safe_context(log.context),
    )
    if include_client_uuid:
        serialized["client_uuid"] = log.client_uuid
    return serialized
def _determine_client_status(is_alive, process_status, screen_health_status, log_counts):
    """Classify a client as "offline", "critical", "warning" or "healthy".

    Precedence, highest first:
    - not alive                                         -> "offline"
    - crashed process, BLACK/FROZEN screen, any ERROR   -> "critical"
    - starting/stopped process, or any WARN log         -> "warning"
    - otherwise                                         -> "healthy"
    """
    if not is_alive:
        return "offline"
    # The `or` chains are evaluated lazily, so log_counts is only consulted
    # when the process/screen state alone does not decide the outcome.
    is_critical = (
        process_status == "crashed"
        or screen_health_status in ("BLACK", "FROZEN")
        or log_counts.get("ERROR", 0) > 0
    )
    if is_critical:
        return "critical"
    is_degraded = process_status in ("starting", "stopped") or log_counts.get("WARN", 0) > 0
    return "warning" if is_degraded else "healthy"
def _infer_last_screenshot_ts(client_uuid):
    """Derive the time of a client's newest screenshot from file mtimes.

    Looks in the ``screenshots`` directory next to this module's parent for
    ``<uuid>.jpg`` and any ``<uuid>_*.jpg`` files, and returns the most recent
    modification time as an aware UTC datetime. Returns None when no such
    file exists or when reading the mtimes fails.
    """
    base_dir = os.path.join(os.path.dirname(__file__), "..", "screenshots")
    current = os.path.join(base_dir, f"{client_uuid}.jpg")
    paths = [current] if os.path.exists(current) else []
    paths += glob.glob(os.path.join(base_dir, f"{client_uuid}_*.jpg"))
    if not paths:
        return None
    try:
        newest_mtime = os.path.getmtime(max(paths, key=os.path.getmtime))
        return datetime.fromtimestamp(newest_mtime, timezone.utc)
    except Exception:
        # Best effort: e.g. a file may vanish between listing and stat.
        return None
def _load_screenshot_metadata(client_uuid):
    """Read ``<uuid>_meta.json`` from the screenshots directory.

    Returns the parsed JSON object when the file exists and holds a dict;
    returns {} when the file is missing, unreadable, not valid JSON, or holds
    any other JSON type.
    """
    meta_path = os.path.join(
        os.path.dirname(__file__), "..", "screenshots", f"{client_uuid}_meta.json"
    )
    if os.path.exists(meta_path):
        try:
            with open(meta_path, "r", encoding="utf-8") as handle:
                parsed = json.load(handle)
        except Exception:
            # Metadata is optional; a broken file must not break the caller.
            return {}
        if isinstance(parsed, dict):
            return parsed
    return {}
def _is_priority_screenshot_active(priority_received_at):
    """Tell whether a priority screenshot is still inside its TTL window.

    *priority_received_at* is expected to be an ISO 8601 timestamp (a ``Z``
    suffix is accepted). Returns False for a missing or unparsable value,
    otherwise True while the timestamp is at most
    PRIORITY_SCREENSHOT_TTL_SECONDS old.
    """
    if not priority_received_at:
        return False
    try:
        # datetime.fromisoformat is given "+00:00" in place of "Z".
        iso_text = str(priority_received_at).replace("Z", "+00:00")
        received_utc = _to_utc(datetime.fromisoformat(iso_text))
    except Exception:
        return False
    age = datetime.now(timezone.utc) - received_utc
    return age <= timedelta(seconds=PRIORITY_SCREENSHOT_TTL_SECONDS)
@client_logs_bp.route("/test", methods=["GET"])
def test_client_logs():
    """Test endpoint to verify logging infrastructure (no auth required).

    Responds with the total number of stored client logs, the ERROR/WARN/INFO
    totals and the five most recent entries. Any exception is reported as a
    500 response that carries the exception text.
    """
    session = Session()
    try:
        # Overall number of stored log rows
        total_logs = session.query(func.count(ClientLog.id)).scalar()

        # Per-level totals, queried in the order ERROR, WARN, INFO
        level_counts = {}
        for label, level in (
            ("ERROR", LogLevel.ERROR),
            ("WARN", LogLevel.WARN),
            ("INFO", LogLevel.INFO),
        ):
            level_counts[label] = (
                session.query(func.count(ClientLog.id)).filter_by(level=level).scalar()
            )

        # Five newest entries, reduced to plain dicts
        newest_first = session.query(ClientLog).order_by(desc(ClientLog.timestamp)).limit(5)
        recent = [
            {
                "client_uuid": row.client_uuid,
                "level": row.level.value if row.level else None,
                "message": row.message,
                "timestamp": row.timestamp.isoformat() if row.timestamp else None,
            }
            for row in newest_first.all()
        ]

        session.close()
        return jsonify({
            "status": "ok",
            "infrastructure": "working",
            "total_logs": total_logs,
            "counts": level_counts,
            "recent_5": recent,
        })
    except Exception as exc:
        session.close()
        return jsonify({"status": "error", "message": str(exc)}), 500
@client_logs_bp.route("/<uuid>/logs", methods=["GET"])
@admin_or_higher
def get_client_logs(uuid):
    """
    Get logs for a specific client, newest first.

    Query params:
    - level: ERROR, WARN, INFO, DEBUG (optional, case-insensitive)
    - limit: number of entries (default 50, clamped to the range 1..500)
    - since: ISO 8601 timestamp (optional); a value without an offset is
      treated as UTC

    Responses:
    - 200: {"client_uuid", "logs", "count", "limit"}
    - 400: non-integer limit, unknown level, or unparsable timestamp
    - 404: no client with the given uuid
    - 500: any other failure

    Example: /api/client-logs/abc-123/logs?level=ERROR&limit=100
    """
    session = Session()
    try:
        # Verify client exists
        client = session.query(Client).filter_by(uuid=uuid).first()
        if not client:
            return jsonify({"error": "Client not found"}), 404

        # A malformed limit is a client error, not a server error.
        try:
            limit = int(request.args.get('limit', 50))
        except ValueError:
            return jsonify({"error": "Invalid limit. Must be an integer"}), 400
        # Clamp on both sides so a negative value cannot reach the query.
        limit = max(1, min(limit, 500))

        level_param = request.args.get('level')
        since_param = request.args.get('since')

        query = session.query(ClientLog).filter_by(client_uuid=uuid)

        # Filter by log level
        if level_param:
            try:
                level_enum = LogLevel[level_param.upper()]
            except KeyError:
                return jsonify({"error": f"Invalid level: {level_param}. Must be ERROR, WARN, INFO, or DEBUG"}), 400
            query = query.filter_by(level=level_enum)

        # Filter by timestamp
        if since_param:
            try:
                # Handle both with and without 'Z' suffix
                since_dt = datetime.fromisoformat(since_param.replace('Z', '+00:00'))
            except ValueError:
                return jsonify({"error": "Invalid timestamp format. Use ISO 8601"}), 400
            if since_dt.tzinfo is None:
                since_dt = since_dt.replace(tzinfo=timezone.utc)
            query = query.filter(ClientLog.timestamp >= since_dt)

        logs = query.order_by(desc(ClientLog.timestamp)).limit(limit).all()
        result = [_serialize_log_entry(log) for log in logs]

        return jsonify({
            "client_uuid": uuid,
            "logs": result,
            "count": len(result),
            "limit": limit
        })
    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Single release point for every exit path above.
        session.close()
@client_logs_bp.route("/summary", methods=["GET"])
@admin_or_higher
def get_logs_summary():
    """
    Get a per-client summary of log counts for a recent time window.

    Query params:
    - hours: size of the window in hours (default 24, clamped to 1..168,
      i.e. at most one week)

    Only clients that have at least one log in the window appear in the
    summary. Each entry carries the ERROR/WARN/INFO/DEBUG counts plus the
    client's hostname and description when the client is known.

    Example response:
    {
      "summary": {
        "client-uuid-1": {
          "counts": {"ERROR": 5, "WARN": 12, "INFO": 45, "DEBUG": 0},
          "info": {"hostname": "screen-01", "description": "Lobby"}
        }
      },
      "period_hours": 24,
      "since": "2026-03-08T21:00:00+00:00",
      "timestamp": "2026-03-09T21:00:00+00:00"
    }

    Responses: 200 on success, 400 for a non-integer hours value, 500 for any
    other failure.
    """
    session = Session()
    try:
        # A malformed value is a client error, not a server error.
        try:
            hours = int(request.args.get('hours', 24))
        except ValueError:
            return jsonify({"error": "Invalid hours. Must be an integer"}), 400
        # Clamp on both sides; a negative value would put `since` in the future.
        hours = max(1, min(hours, 168))
        since = datetime.now(timezone.utc) - timedelta(hours=hours)

        # Query log counts grouped by client and level
        stats = session.query(
            ClientLog.client_uuid,
            ClientLog.level,
            func.count(ClientLog.id).label('count')
        ).filter(
            ClientLog.timestamp >= since
        ).group_by(
            ClientLog.client_uuid,
            ClientLog.level
        ).all()

        # Build summary dictionary; every client starts with all levels at 0
        summary = {}
        for client_uuid, level, count in stats:
            counts = summary.setdefault(
                client_uuid,
                {"ERROR": 0, "WARN": 0, "INFO": 0, "DEBUG": 0},
            )
            # The serializers in this file treat level as optional, so a row
            # without a level is skipped rather than failing the request.
            if level is not None:
                counts[level.value] = count

        # Get client info for enrichment
        clients = session.query(Client.uuid, Client.hostname, Client.description).all()
        client_info = {c.uuid: {"hostname": c.hostname, "description": c.description} for c in clients}

        # Logs whose client no longer exists get an empty info dict
        enriched_summary = {
            client_uuid: {"counts": counts, "info": client_info.get(client_uuid, {})}
            for client_uuid, counts in summary.items()
        }

        return jsonify({
            "summary": enriched_summary,
            "period_hours": hours,
            "since": since.isoformat(),
            "timestamp": datetime.now(timezone.utc).isoformat()
        })
    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Single release point for every exit path above.
        session.close()
@client_logs_bp.route("/monitoring-overview", methods=["GET"])
@superadmin_only
def get_monitoring_overview():
    """Return a dashboard-friendly monitoring overview for all clients.

    Query params:
    - hours: look-back window for the log counts (default 24, clamped to
      1..168)

    The payload holds aggregate "summary" counters, the applied window
    ("period_hours", "since"), the heartbeat grace period, a generation
    timestamp, and one entry per client with liveness, process and screen
    health, screenshot details (including priority screenshot state) and log
    statistics. The payload is passed through dict_to_camel_case before it is
    serialised.

    Responses: 200 on success, 400 for a non-integer hours value, 500 for any
    other failure.
    """
    session = Session()
    try:
        # A malformed value is a client error, not a server error.
        try:
            hours = int(request.args.get("hours", 24))
        except ValueError:
            return jsonify({"error": "Invalid hours. Must be an integer"}), 400
        # Clamp on both sides; a negative value would put `since` in the future.
        hours = max(1, min(hours, 168))
        since = datetime.now(timezone.utc) - timedelta(hours=hours)

        clients = (
            session.query(Client, ClientGroup.name.label("group_name"))
            .outerjoin(ClientGroup, Client.group_id == ClientGroup.id)
            .order_by(ClientGroup.name.asc(), Client.description.asc(), Client.hostname.asc(), Client.uuid.asc())
            .all()
        )

        log_stats = (
            session.query(
                ClientLog.client_uuid,
                ClientLog.level,
                func.count(ClientLog.id).label("count"),
            )
            .filter(ClientLog.timestamp >= since)
            .group_by(ClientLog.client_uuid, ClientLog.level)
            .all()
        )

        # Per-client log counts within the window, all levels starting at 0
        counts_by_client = {}
        for stat_uuid, level, count in log_stats:
            per_client = counts_by_client.setdefault(
                stat_uuid,
                {"ERROR": 0, "WARN": 0, "INFO": 0, "DEBUG": 0},
            )
            # The serializers in this file treat level as optional, so a row
            # without a level is skipped rather than failing the request.
            if level is not None:
                per_client[level.value] = count

        clients_payload = []
        summary_counts = {
            "total_clients": 0,
            "online_clients": 0,
            "offline_clients": 0,
            "healthy_clients": 0,
            "warning_clients": 0,
            "critical_clients": 0,
            "error_logs": 0,
            "warn_logs": 0,
            "active_priority_screenshots": 0,
        }

        for client, group_name in clients:
            log_counts = counts_by_client.get(
                client.uuid,
                {"ERROR": 0, "WARN": 0, "INFO": 0, "DEBUG": 0},
            )
            is_alive = _is_client_alive(client.last_alive, client.is_active)
            process_status = client.process_status.value if client.process_status else None
            screen_health_status = client.screen_health_status.value if client.screen_health_status else None
            status = _determine_client_status(is_alive, process_status, screen_health_status, log_counts)

            # Latest entries are looked up regardless of the `hours` window.
            # NOTE(review): this issues two queries per client.
            latest_log = (
                session.query(ClientLog)
                .filter_by(client_uuid=client.uuid)
                .order_by(desc(ClientLog.timestamp))
                .first()
            )
            latest_error = (
                session.query(ClientLog)
                .filter_by(client_uuid=client.uuid, level=LogLevel.ERROR)
                .order_by(desc(ClientLog.timestamp))
                .first()
            )

            # Prefer the stored timestamp; fall back to screenshot file mtimes.
            screenshot_ts = client.last_screenshot_analyzed or _infer_last_screenshot_ts(client.uuid)
            screenshot_meta = _load_screenshot_metadata(client.uuid)
            latest_screenshot_type = screenshot_meta.get("latest_screenshot_type") or "periodic"
            priority_screenshot_type = screenshot_meta.get("last_priority_screenshot_type")
            priority_screenshot_received_at = screenshot_meta.get("last_priority_received_at")
            has_active_priority = _is_priority_screenshot_active(priority_screenshot_received_at)
            # While a priority screenshot is active, point at the priority URL.
            screenshot_url = f"/screenshots/{client.uuid}/priority" if has_active_priority else f"/screenshots/{client.uuid}"

            clients_payload.append({
                "uuid": client.uuid,
                "hostname": client.hostname,
                "description": client.description,
                "ip": client.ip,
                "model": client.model,
                "group_id": client.group_id,
                "group_name": group_name,
                "registration_time": client.registration_time.isoformat() if client.registration_time else None,
                "last_alive": client.last_alive.isoformat() if client.last_alive else None,
                "is_alive": is_alive,
                "status": status,
                "current_event_id": client.current_event_id,
                "current_process": client.current_process,
                "process_status": process_status,
                "process_pid": client.process_pid,
                "screen_health_status": screen_health_status,
                "last_screenshot_analyzed": screenshot_ts.isoformat() if screenshot_ts else None,
                "last_screenshot_hash": client.last_screenshot_hash,
                "latest_screenshot_type": latest_screenshot_type,
                "priority_screenshot_type": priority_screenshot_type,
                "priority_screenshot_received_at": priority_screenshot_received_at,
                "has_active_priority_screenshot": has_active_priority,
                "screenshot_url": screenshot_url,
                "log_counts_24h": {
                    "error": log_counts["ERROR"],
                    "warn": log_counts["WARN"],
                    "info": log_counts["INFO"],
                    "debug": log_counts["DEBUG"],
                },
                "latest_log": _serialize_log_entry(latest_log),
                "latest_error": _serialize_log_entry(latest_error),
            })

            summary_counts["total_clients"] += 1
            summary_counts["error_logs"] += log_counts["ERROR"]
            summary_counts["warn_logs"] += log_counts["WARN"]
            if has_active_priority:
                summary_counts["active_priority_screenshots"] += 1
            if is_alive:
                summary_counts["online_clients"] += 1
            else:
                summary_counts["offline_clients"] += 1
            # "offline" clients are counted above, not in a status bucket.
            if status == "healthy":
                summary_counts["healthy_clients"] += 1
            elif status == "warning":
                summary_counts["warning_clients"] += 1
            elif status == "critical":
                summary_counts["critical_clients"] += 1

        payload = {
            "summary": summary_counts,
            "period_hours": hours,
            "grace_period_seconds": _grace_period_seconds(),
            "since": since.isoformat(),
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "clients": clients_payload,
        }
        return jsonify(dict_to_camel_case(payload))
    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Single release point for every exit path above.
        session.close()
@client_logs_bp.route("/recent-errors", methods=["GET"])
@admin_or_higher
def get_recent_errors():
    """
    Get recent ERROR logs across all clients, newest first.

    Query params:
    - limit: number of entries (default 20, clamped to the range 1..100)

    Responses:
    - 200: {"errors": [...], "count": n}; each entry includes client_uuid
    - 400: non-integer limit
    - 500: any other failure

    Useful for system-wide error monitoring
    """
    session = Session()
    try:
        # A malformed limit is a client error, not a server error.
        try:
            limit = int(request.args.get('limit', 20))
        except ValueError:
            return jsonify({"error": "Invalid limit. Must be an integer"}), 400
        # Clamp on both sides so a negative value cannot reach the query.
        limit = max(1, min(limit, 100))

        # Get recent errors from all clients
        logs = session.query(ClientLog).filter_by(
            level=LogLevel.ERROR
        ).order_by(
            desc(ClientLog.timestamp)
        ).limit(limit).all()

        result = [_serialize_log_entry(log, include_client_uuid=True) for log in logs]

        return jsonify({
            "errors": result,
            "count": len(result)
        })
    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Single release point for every exit path above.
        session.close()