- Add GET /api/clients/crashed endpoint (process_status=crashed or stale heartbeat) - Add restart_app command action with same lifecycle + lockout as reboot_host - Scheduler: crash auto-recovery loop (CRASH_RECOVERY_ENABLED flag, lockout, MQTT publish) - Scheduler: unconditional command expiry sweep per poll cycle (sweep_expired_commands) - Listener: subscribe to infoscreen/+/service_failed; persist service_failed_at + unit - Listener: extract broker_connection block from health payload; persist reconnect_count + last_disconnect_at - DB migration b1c2d3e4f5a6: service_failed_at, service_failed_unit, mqtt_reconnect_count, mqtt_last_disconnect_at on clients - Add GET /api/clients/service_failed and POST /api/clients/<uuid>/clear_service_failed - Monitoring overview API: include mqtt_reconnect_count + mqtt_last_disconnect_at per client - Frontend: orange service-failed alert panel (hidden when empty, auto-refresh, quittieren action) - Frontend: MQTT reconnect count + last disconnect in client detail panel - MQTT auth hardening: listener/scheduler/server use env credentials; broker enforces allow_anonymous false - Client command lifecycle foundation: ClientCommand model, reboot_host/shutdown_host, full ACK lifecycle - Docs: TECH-CHANGELOG, DEV-CHANGELOG, MQTT_EVENT_PAYLOAD_GUIDE, copilot-instructions updated - Add implementation-plans/, RESTART_VALIDATION_CHECKLIST.md, TODO.md
494 lines
18 KiB
Python
494 lines
18 KiB
Python
from flask import Blueprint, jsonify, request
|
|
from server.database import Session
|
|
from server.permissions import admin_or_higher, superadmin_only
|
|
from models.models import ClientLog, Client, ClientGroup, LogLevel
|
|
from sqlalchemy import desc, func
|
|
from datetime import datetime, timedelta, timezone
|
|
import json
|
|
import os
|
|
import glob
|
|
|
|
from server.serializers import dict_to_camel_case
|
|
|
|
# Blueprint bundling all client-log and monitoring endpoints under /api/client-logs.
client_logs_bp = Blueprint("client_logs", __name__, url_prefix="/api/client-logs")
|
|
# Seconds a "priority" screenshot stays preferred over the periodic one (env-overridable).
PRIORITY_SCREENSHOT_TTL_SECONDS = int(os.environ.get("PRIORITY_SCREENSHOT_TTL_SECONDS", "120"))
|
|
|
|
|
|
def _grace_period_seconds():
|
|
env = os.environ.get("ENV", "production").lower()
|
|
if env in ("development", "dev"):
|
|
return int(os.environ.get("HEARTBEAT_GRACE_PERIOD_DEV", "180"))
|
|
return int(os.environ.get("HEARTBEAT_GRACE_PERIOD_PROD", "170"))
|
|
|
|
|
|
def _to_utc(dt):
|
|
if dt is None:
|
|
return None
|
|
if dt.tzinfo is None:
|
|
return dt.replace(tzinfo=timezone.utc)
|
|
return dt.astimezone(timezone.utc)
|
|
|
|
|
|
def _is_client_alive(last_alive, is_active):
    """True when the client is active and its last heartbeat is within the grace period."""
    if not (last_alive and is_active):
        return False
    heartbeat_age = datetime.now(timezone.utc) - _to_utc(last_alive)
    return heartbeat_age <= timedelta(seconds=_grace_period_seconds())
|
|
|
|
|
|
def _safe_context(raw_context):
|
|
if not raw_context:
|
|
return {}
|
|
try:
|
|
return json.loads(raw_context)
|
|
except (TypeError, json.JSONDecodeError):
|
|
return {"raw": raw_context}
|
|
|
|
|
|
def _serialize_log_entry(log, include_client_uuid=False):
    """Convert a ClientLog row into a JSON-safe dict; returns None for a missing row."""
    if not log:
        return None

    serialized = {
        "id": log.id,
        "timestamp": log.timestamp.isoformat() if log.timestamp else None,
        "level": log.level.value if log.level else None,
        "message": log.message,
        "context": _safe_context(log.context),
    }
    if include_client_uuid:
        serialized["client_uuid"] = log.client_uuid
    return serialized
|
|
|
|
|
|
def _determine_client_status(is_alive, process_status, screen_health_status, log_counts):
|
|
if not is_alive:
|
|
return "offline"
|
|
if process_status == "crashed" or screen_health_status in ("BLACK", "FROZEN"):
|
|
return "critical"
|
|
if log_counts.get("ERROR", 0) > 0:
|
|
return "critical"
|
|
if process_status in ("starting", "stopped") or log_counts.get("WARN", 0) > 0:
|
|
return "warning"
|
|
return "healthy"
|
|
|
|
|
|
def _infer_last_screenshot_ts(client_uuid):
|
|
screenshots_dir = os.path.join(os.path.dirname(__file__), "..", "screenshots")
|
|
|
|
candidate_files = []
|
|
latest_file = os.path.join(screenshots_dir, f"{client_uuid}.jpg")
|
|
if os.path.exists(latest_file):
|
|
candidate_files.append(latest_file)
|
|
|
|
candidate_files.extend(glob.glob(os.path.join(screenshots_dir, f"{client_uuid}_*.jpg")))
|
|
if not candidate_files:
|
|
return None
|
|
|
|
try:
|
|
newest_path = max(candidate_files, key=os.path.getmtime)
|
|
return datetime.fromtimestamp(os.path.getmtime(newest_path), timezone.utc)
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _load_screenshot_metadata(client_uuid):
|
|
screenshots_dir = os.path.join(os.path.dirname(__file__), "..", "screenshots")
|
|
metadata_path = os.path.join(screenshots_dir, f"{client_uuid}_meta.json")
|
|
if not os.path.exists(metadata_path):
|
|
return {}
|
|
|
|
try:
|
|
with open(metadata_path, "r", encoding="utf-8") as metadata_file:
|
|
data = json.load(metadata_file)
|
|
return data if isinstance(data, dict) else {}
|
|
except Exception:
|
|
return {}
|
|
|
|
|
|
def _is_priority_screenshot_active(priority_received_at):
|
|
if not priority_received_at:
|
|
return False
|
|
|
|
try:
|
|
normalized = str(priority_received_at).replace("Z", "+00:00")
|
|
parsed = datetime.fromisoformat(normalized)
|
|
parsed_utc = _to_utc(parsed)
|
|
except Exception:
|
|
return False
|
|
|
|
return (datetime.now(timezone.utc) - parsed_utc) <= timedelta(seconds=PRIORITY_SCREENSHOT_TTL_SECONDS)
|
|
|
|
|
|
@client_logs_bp.route("/test", methods=["GET"])
def test_client_logs():
    """Test endpoint to verify logging infrastructure (no auth required)

    Returns total log count, per-level counts, and the 5 most recent entries.
    """
    session = Session()
    try:
        # Count total logs
        total_logs = session.query(func.count(ClientLog.id)).scalar()

        # Count by level
        error_count = session.query(func.count(ClientLog.id)).filter_by(level=LogLevel.ERROR).scalar()
        warn_count = session.query(func.count(ClientLog.id)).filter_by(level=LogLevel.WARN).scalar()
        info_count = session.query(func.count(ClientLog.id)).filter_by(level=LogLevel.INFO).scalar()

        # Get last 5 logs
        recent_logs = session.query(ClientLog).order_by(desc(ClientLog.timestamp)).limit(5).all()
        recent = [
            {
                "client_uuid": log.client_uuid,
                "level": log.level.value if log.level else None,
                "message": log.message,
                "timestamp": log.timestamp.isoformat() if log.timestamp else None
            }
            for log in recent_logs
        ]

        return jsonify({
            "status": "ok",
            "infrastructure": "working",
            "total_logs": total_logs,
            "counts": {
                "ERROR": error_count,
                "WARN": warn_count,
                "INFO": info_count
            },
            "recent_5": recent
        })
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
    finally:
        # Release the DB session on every path (success, handled and
        # unexpected errors) — previously closed manually in two places.
        session.close()
|
|
|
|
|
|
@client_logs_bp.route("/<uuid>/logs", methods=["GET"])
@admin_or_higher
def get_client_logs(uuid):
    """
    Get logs for a specific client
    Query params:
    - level: ERROR, WARN, INFO, DEBUG (optional)
    - limit: number of entries (default 50, max 500)
    - since: ISO timestamp (optional)

    Example: /api/client-logs/abc-123/logs?level=ERROR&limit=100
    """
    session = Session()
    try:
        # Verify client exists
        client = session.query(Client).filter_by(uuid=uuid).first()
        if not client:
            return jsonify({"error": "Client not found"}), 404

        # Parse query parameters; a non-numeric limit is a client error
        # (previously fell through to the generic 500 handler).
        level_param = request.args.get('level')
        try:
            limit = min(int(request.args.get('limit', 50)), 500)
        except ValueError:
            return jsonify({"error": "Invalid limit: must be an integer"}), 400
        since_param = request.args.get('since')

        # Build query
        query = session.query(ClientLog).filter_by(client_uuid=uuid)

        # Filter by log level
        if level_param:
            try:
                level_enum = LogLevel[level_param.upper()]
                query = query.filter_by(level=level_enum)
            except KeyError:
                return jsonify({"error": f"Invalid level: {level_param}. Must be ERROR, WARN, INFO, or DEBUG"}), 400

        # Filter by timestamp
        if since_param:
            try:
                # Handle both with and without 'Z' suffix
                since_str = since_param.replace('Z', '+00:00')
                since_dt = datetime.fromisoformat(since_str)
                if since_dt.tzinfo is None:
                    since_dt = since_dt.replace(tzinfo=timezone.utc)
                query = query.filter(ClientLog.timestamp >= since_dt)
            except ValueError:
                return jsonify({"error": "Invalid timestamp format. Use ISO 8601"}), 400

        # Execute query, newest first
        logs = query.order_by(desc(ClientLog.timestamp)).limit(limit).all()
        result = [_serialize_log_entry(log) for log in logs]

        return jsonify({
            "client_uuid": uuid,
            "logs": result,
            "count": len(result),
            "limit": limit
        })

    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Session is released on every path, including the early returns
        # (previously each return had its own session.close() call).
        session.close()
|
|
|
|
|
|
@client_logs_bp.route("/summary", methods=["GET"])
@admin_or_higher
def get_logs_summary():
    """
    Get summary of errors/warnings across all clients in last 24 hours
    Returns count of ERROR, WARN, INFO logs per client

    Example response:
    {
        "summary": {
            "client-uuid-1": {"ERROR": 5, "WARN": 12, "INFO": 45},
            "client-uuid-2": {"ERROR": 0, "WARN": 3, "INFO": 20}
        },
        "period_hours": 24,
        "timestamp": "2026-03-09T21:00:00Z"
    }
    """
    session = Session()
    try:
        # Get hours parameter (default 24, max 168 = 1 week); a non-numeric
        # value is a client error (previously surfaced as a generic 500).
        try:
            hours = min(int(request.args.get('hours', 24)), 168)
        except ValueError:
            return jsonify({"error": "Invalid hours: must be an integer"}), 400
        since = datetime.now(timezone.utc) - timedelta(hours=hours)

        # Query log counts grouped by client and level
        stats = session.query(
            ClientLog.client_uuid,
            ClientLog.level,
            func.count(ClientLog.id).label('count')
        ).filter(
            ClientLog.timestamp >= since
        ).group_by(
            ClientLog.client_uuid,
            ClientLog.level
        ).all()

        # Build summary dictionary; every client starts with all levels at 0
        summary = {}
        for stat in stats:
            uuid = stat.client_uuid
            if uuid not in summary:
                summary[uuid] = {
                    "ERROR": 0,
                    "WARN": 0,
                    "INFO": 0,
                    "DEBUG": 0
                }
            summary[uuid][stat.level.value] = stat.count

        # Get client info for enrichment
        clients = session.query(Client.uuid, Client.hostname, Client.description).all()
        client_info = {c.uuid: {"hostname": c.hostname, "description": c.description} for c in clients}

        # Enrich summary with client info
        enriched_summary = {
            uuid: {"counts": counts, "info": client_info.get(uuid, {})}
            for uuid, counts in summary.items()
        }

        return jsonify({
            "summary": enriched_summary,
            "period_hours": hours,
            "since": since.isoformat(),
            "timestamp": datetime.now(timezone.utc).isoformat()
        })

    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Session is released on every path (was closed manually twice).
        session.close()
|
|
|
|
|
|
@client_logs_bp.route("/monitoring-overview", methods=["GET"])
@superadmin_only
def get_monitoring_overview():
    """Return a dashboard-friendly monitoring overview for all clients.

    Query params:
    - hours: look-back window for log counts (default 24, max 168)

    Combines heartbeat liveness, process/screen health, per-level log counts,
    screenshot freshness (incl. priority screenshots) and MQTT reconnect stats
    per client, plus fleet-wide summary counters. Response keys are converted
    to camelCase via dict_to_camel_case.
    """
    session = Session()
    try:
        # Non-numeric hours is a client error (previously a generic 500).
        try:
            hours = min(int(request.args.get("hours", 24)), 168)
        except ValueError:
            return jsonify({"error": "Invalid hours: must be an integer"}), 400
        since = datetime.now(timezone.utc) - timedelta(hours=hours)

        # All clients with their (optional) group name, in stable display order.
        clients = (
            session.query(Client, ClientGroup.name.label("group_name"))
            .outerjoin(ClientGroup, Client.group_id == ClientGroup.id)
            .order_by(ClientGroup.name.asc(), Client.description.asc(), Client.hostname.asc(), Client.uuid.asc())
            .all()
        )

        # Per-client, per-level log counts within the window.
        log_stats = (
            session.query(
                ClientLog.client_uuid,
                ClientLog.level,
                func.count(ClientLog.id).label("count"),
            )
            .filter(ClientLog.timestamp >= since)
            .group_by(ClientLog.client_uuid, ClientLog.level)
            .all()
        )

        counts_by_client = {}
        for stat in log_stats:
            if stat.client_uuid not in counts_by_client:
                counts_by_client[stat.client_uuid] = {
                    "ERROR": 0,
                    "WARN": 0,
                    "INFO": 0,
                    "DEBUG": 0,
                }
            counts_by_client[stat.client_uuid][stat.level.value] = stat.count

        clients_payload = []
        summary_counts = {
            "total_clients": 0,
            "online_clients": 0,
            "offline_clients": 0,
            "healthy_clients": 0,
            "warning_clients": 0,
            "critical_clients": 0,
            "error_logs": 0,
            "warn_logs": 0,
            "active_priority_screenshots": 0,
        }

        for client, group_name in clients:
            log_counts = counts_by_client.get(
                client.uuid,
                {"ERROR": 0, "WARN": 0, "INFO": 0, "DEBUG": 0},
            )
            is_alive = _is_client_alive(client.last_alive, client.is_active)
            process_status = client.process_status.value if client.process_status else None
            screen_health_status = client.screen_health_status.value if client.screen_health_status else None
            status = _determine_client_status(is_alive, process_status, screen_health_status, log_counts)

            # NOTE(review): these two queries run once per client (N+1);
            # acceptable for small fleets, consider a window-function batch
            # query if the client count grows.
            latest_log = (
                session.query(ClientLog)
                .filter_by(client_uuid=client.uuid)
                .order_by(desc(ClientLog.timestamp))
                .first()
            )
            latest_error = (
                session.query(ClientLog)
                .filter_by(client_uuid=client.uuid, level=LogLevel.ERROR)
                .order_by(desc(ClientLog.timestamp))
                .first()
            )

            # Screenshot freshness: DB value first, filesystem mtime fallback.
            screenshot_ts = client.last_screenshot_analyzed or _infer_last_screenshot_ts(client.uuid)
            screenshot_meta = _load_screenshot_metadata(client.uuid)
            latest_screenshot_type = screenshot_meta.get("latest_screenshot_type") or "periodic"
            priority_screenshot_type = screenshot_meta.get("last_priority_screenshot_type")
            priority_screenshot_received_at = screenshot_meta.get("last_priority_received_at")
            has_active_priority = _is_priority_screenshot_active(priority_screenshot_received_at)
            screenshot_url = f"/screenshots/{client.uuid}/priority" if has_active_priority else f"/screenshots/{client.uuid}"

            clients_payload.append({
                "uuid": client.uuid,
                "hostname": client.hostname,
                "description": client.description,
                "ip": client.ip,
                "model": client.model,
                "group_id": client.group_id,
                "group_name": group_name,
                "registration_time": client.registration_time.isoformat() if client.registration_time else None,
                "last_alive": client.last_alive.isoformat() if client.last_alive else None,
                "is_alive": is_alive,
                "status": status,
                "current_event_id": client.current_event_id,
                "current_process": client.current_process,
                "process_status": process_status,
                "process_pid": client.process_pid,
                "screen_health_status": screen_health_status,
                "last_screenshot_analyzed": screenshot_ts.isoformat() if screenshot_ts else None,
                "last_screenshot_hash": client.last_screenshot_hash,
                "latest_screenshot_type": latest_screenshot_type,
                "priority_screenshot_type": priority_screenshot_type,
                "priority_screenshot_received_at": priority_screenshot_received_at,
                "has_active_priority_screenshot": has_active_priority,
                "screenshot_url": screenshot_url,
                "log_counts_24h": {
                    "error": log_counts["ERROR"],
                    "warn": log_counts["WARN"],
                    "info": log_counts["INFO"],
                    "debug": log_counts["DEBUG"],
                },
                "latest_log": _serialize_log_entry(latest_log),
                "latest_error": _serialize_log_entry(latest_error),
                "mqtt_reconnect_count": client.mqtt_reconnect_count,
                "mqtt_last_disconnect_at": client.mqtt_last_disconnect_at.isoformat() if client.mqtt_last_disconnect_at else None,
            })

            # Fleet-wide counters.
            summary_counts["total_clients"] += 1
            summary_counts["error_logs"] += log_counts["ERROR"]
            summary_counts["warn_logs"] += log_counts["WARN"]
            if has_active_priority:
                summary_counts["active_priority_screenshots"] += 1
            if is_alive:
                summary_counts["online_clients"] += 1
            else:
                summary_counts["offline_clients"] += 1
            if status == "healthy":
                summary_counts["healthy_clients"] += 1
            elif status == "warning":
                summary_counts["warning_clients"] += 1
            elif status == "critical":
                summary_counts["critical_clients"] += 1

        payload = {
            "summary": summary_counts,
            "period_hours": hours,
            "grace_period_seconds": _grace_period_seconds(),
            "since": since.isoformat(),
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "clients": clients_payload,
        }
        return jsonify(dict_to_camel_case(payload))

    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Session is released on every path, including the early 400 return
        # (previously closed manually before each return).
        session.close()
|
|
|
|
|
|
@client_logs_bp.route("/recent-errors", methods=["GET"])
@admin_or_higher
def get_recent_errors():
    """
    Get recent ERROR logs across all clients
    Query params:
    - limit: number of entries (default 20, max 100)

    Useful for system-wide error monitoring
    """
    session = Session()
    try:
        # A non-numeric limit is a client error (previously a generic 500).
        try:
            limit = min(int(request.args.get('limit', 20)), 100)
        except ValueError:
            return jsonify({"error": "Invalid limit: must be an integer"}), 400

        # Get recent errors from all clients, newest first
        logs = session.query(ClientLog).filter_by(
            level=LogLevel.ERROR
        ).order_by(
            desc(ClientLog.timestamp)
        ).limit(limit).all()

        result = [_serialize_log_entry(log, include_client_uuid=True) for log in logs]

        return jsonify({
            "errors": result,
            "count": len(result)
        })

    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Session is released on every path (was closed manually twice).
        session.close()
|