feat(monitoring): add server-side client logging and health infrastructure
- add Alembic migration c1d2e3f4g5h6 for client monitoring:
- create client_logs table with FK to clients.uuid and performance indexes
- extend clients with process/health tracking fields
- extend data model with ClientLog, LogLevel, ProcessStatus, and ScreenHealthStatus
- enhance listener MQTT handling:
- subscribe to logs and health topics
- persist client logs from infoscreen/{uuid}/logs/{level}
- process health payloads and enrich heartbeat-derived client state
- add monitoring API blueprint server/routes/client_logs.py:
- GET /api/client-logs/<uuid>/logs
- GET /api/client-logs/summary
- GET /api/client-logs/recent-errors
- GET /api/client-logs/test
- register client_logs blueprint in server/wsgi.py
- align compose/dev runtime for listener live-code execution
- add client-side implementation docs:
- CLIENT_MONITORING_SPECIFICATION.md
- CLIENT_MONITORING_IMPLEMENTATION_GUIDE.md
- update TECH-CHANGELOG.md and copilot-instructions.md:
- document monitoring changes
- codify post-release technical-notes/no-version-bump convention
This commit is contained in:
@@ -0,0 +1,84 @@
|
||||
"""add client monitoring tables and columns
|
||||
|
||||
Revision ID: c1d2e3f4g5h6
|
||||
Revises: 4f0b8a3e5c20
|
||||
Create Date: 2026-03-09 21:08:38.000000
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = 'c1d2e3f4g5h6'
|
||||
down_revision = '4f0b8a3e5c20'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade():
    """Add the client-monitoring columns, table and indexes (rerun-safe)."""
    bind = op.get_bind()
    inspector = sa.inspect(bind)

    # 1. Health-monitoring columns on the clients table.  Each column is
    #    guarded by an inspector lookup so re-running the migration against
    #    a partially migrated schema is harmless.
    existing_columns = {column['name'] for column in inspector.get_columns('clients')}
    monitoring_columns = (
        sa.Column('current_event_id', sa.Integer(), nullable=True),
        sa.Column('current_process', sa.String(50), nullable=True),
        sa.Column('process_status', sa.Enum('running', 'crashed', 'starting', 'stopped', name='processstatus'), nullable=True),
        sa.Column('process_pid', sa.Integer(), nullable=True),
        sa.Column('last_screenshot_analyzed', sa.TIMESTAMP(timezone=True), nullable=True),
        sa.Column('screen_health_status', sa.Enum('OK', 'BLACK', 'FROZEN', 'UNKNOWN', name='screenhealthstatus'), nullable=True, server_default='UNKNOWN'),
        sa.Column('last_screenshot_hash', sa.String(32), nullable=True),
    )
    for column in monitoring_columns:
        if column.name not in existing_columns:
            op.add_column('clients', column)

    # 2. The client_logs table itself (skipped when it already exists).
    if not inspector.has_table('client_logs'):
        op.create_table(
            'client_logs',
            sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
            sa.Column('client_uuid', sa.String(36), nullable=False),
            sa.Column('timestamp', sa.TIMESTAMP(timezone=True), nullable=False),
            sa.Column('level', sa.Enum('ERROR', 'WARN', 'INFO', 'DEBUG', name='loglevel'), nullable=False),
            sa.Column('message', sa.Text(), nullable=False),
            sa.Column('context', sa.JSON(), nullable=True),
            sa.Column('created_at', sa.TIMESTAMP(timezone=True), server_default=sa.func.current_timestamp(), nullable=False),
            sa.PrimaryKeyConstraint('id'),
            sa.ForeignKeyConstraint(['client_uuid'], ['clients.uuid'], ondelete='CASCADE'),
            mysql_charset='utf8mb4',
            mysql_collate='utf8mb4_unicode_ci',
            mysql_engine='InnoDB',
        )

    # 3. Query-path indexes, likewise created only when missing.
    if inspector.has_table('client_logs'):
        log_indexes = {index['name'] for index in inspector.get_indexes('client_logs')}
    else:
        log_indexes = set()
    clients_indexes = {index['name'] for index in inspector.get_indexes('clients')}
    wanted_indexes = (
        ('ix_client_logs_client_timestamp', 'client_logs', ['client_uuid', 'timestamp'], log_indexes),
        ('ix_client_logs_level_timestamp', 'client_logs', ['level', 'timestamp'], log_indexes),
        ('ix_clients_process_status', 'clients', ['process_status'], clients_indexes),
    )
    for index_name, table_name, index_columns, present in wanted_indexes:
        if index_name not in present:
            op.create_index(index_name, table_name, index_columns)
|
||||
|
||||
|
||||
def downgrade():
    """Remove the client-monitoring schema objects.

    Mirrors the rerun-safe style of upgrade(): every DROP is guarded by an
    inspector check, so downgrading a database where the upgrade was only
    partially applied no longer fails on objects that were never created.
    Drop order is unchanged from the original (indexes, table, columns).
    """
    bind = op.get_bind()
    inspector = sa.inspect(bind)

    has_log_table = inspector.has_table('client_logs')
    log_indexes = {index['name'] for index in inspector.get_indexes('client_logs')} if has_log_table else set()
    clients_indexes = {index['name'] for index in inspector.get_indexes('clients')}

    # Drop indexes (only those that actually exist).
    if 'ix_clients_process_status' in clients_indexes:
        op.drop_index('ix_clients_process_status', table_name='clients')
    if 'ix_client_logs_level_timestamp' in log_indexes:
        op.drop_index('ix_client_logs_level_timestamp', table_name='client_logs')
    if 'ix_client_logs_client_timestamp' in log_indexes:
        op.drop_index('ix_client_logs_client_timestamp', table_name='client_logs')

    # Drop the log table.
    if has_log_table:
        op.drop_table('client_logs')

    # Drop the monitoring columns added to clients.
    existing_columns = {column['name'] for column in inspector.get_columns('clients')}
    for column_name in (
        'last_screenshot_hash',
        'screen_health_status',
        'last_screenshot_analyzed',
        'process_pid',
        'process_status',
        'current_process',
        'current_event_id',
    ):
        if column_name in existing_columns:
            op.drop_column('clients', column_name)
    # NOTE(review): the mysql_* table args suggest MySQL, where ENUMs are
    # inline column types and need no extra cleanup; on PostgreSQL the
    # 'processstatus'/'screenhealthstatus' types would be left behind —
    # confirm the target dialect before relying on a full downgrade.
|
||||
255 lines added — server/routes/client_logs.py (new file)
@@ -0,0 +1,255 @@
|
||||
from flask import Blueprint, jsonify, request
|
||||
from server.database import Session
|
||||
from server.permissions import admin_or_higher
|
||||
from models.models import ClientLog, Client, LogLevel
|
||||
from sqlalchemy import desc, func
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import json
|
||||
|
||||
# Blueprint exposing the client-monitoring log endpoints under /api/client-logs.
client_logs_bp = Blueprint("client_logs", __name__, url_prefix="/api/client-logs")
|
||||
|
||||
|
||||
@client_logs_bp.route("/test", methods=["GET"])
def test_client_logs():
    """Smoke-test endpoint for the client-log infrastructure.

    Returns the total number of stored logs, per-level counts and the five
    most recent entries.  Deliberately unauthenticated so deployments can
    probe it; NOTE(review): it still exposes raw log messages — confirm that
    leaving it without auth is acceptable.
    """
    session = Session()
    try:
        # Total number of stored client logs.
        total_logs = session.query(func.count(ClientLog.id)).scalar()

        # Per-level counts in a single grouped query instead of one COUNT
        # round-trip per level (the old filter_by() on a count-only query was
        # also fragile: there is no mapped entity for filter_by to target).
        counts = {"ERROR": 0, "WARN": 0, "INFO": 0}
        level_rows = (
            session.query(ClientLog.level, func.count(ClientLog.id))
            .group_by(ClientLog.level)
            .all()
        )
        for level, count in level_rows:
            if level is not None:
                counts[level.value] = count

        # Five most recent entries, newest first.
        recent_logs = (
            session.query(ClientLog)
            .order_by(desc(ClientLog.timestamp))
            .limit(5)
            .all()
        )
        recent = [
            {
                "client_uuid": log.client_uuid,
                "level": log.level.value if log.level else None,
                "message": log.message,
                "timestamp": log.timestamp.isoformat() if log.timestamp else None,
            }
            for log in recent_logs
        ]

        return jsonify({
            "status": "ok",
            "infrastructure": "working",
            "total_logs": total_logs,
            "counts": counts,
            "recent_5": recent,
        })
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 500
    finally:
        # Always release the session, even if response serialization fails
        # (the old code closed it on each path by hand).
        session.close()
|
||||
|
||||
|
||||
@client_logs_bp.route("/<uuid>/logs", methods=["GET"])
@admin_or_higher
def get_client_logs(uuid):
    """
    Get logs for a specific client.

    Query params:
    - level: ERROR, WARN, INFO, DEBUG (optional)
    - limit: number of entries (default 50, max 500)
    - since: ISO timestamp (optional; 'Z' suffix accepted)

    Example: /api/client-logs/abc-123/logs?level=ERROR&limit=100

    Returns 400 on malformed query parameters, 404 for unknown clients.
    """
    session = Session()
    try:
        # Verify the client exists before querying its logs.
        client = session.query(Client).filter_by(uuid=uuid).first()
        if not client:
            return jsonify({"error": "Client not found"}), 404

        # Parse query parameters.  A non-numeric limit used to escape as an
        # unhandled ValueError (500); report it as a client error instead.
        level_param = request.args.get('level')
        try:
            limit = min(int(request.args.get('limit', 50)), 500)
        except ValueError:
            return jsonify({"error": "Invalid limit: must be an integer"}), 400
        limit = max(limit, 0)  # a negative LIMIT would be a SQL error
        since_param = request.args.get('since')

        # Build the base query for this client.
        query = session.query(ClientLog).filter_by(client_uuid=uuid)

        # Optional log-level filter.
        if level_param:
            try:
                query = query.filter_by(level=LogLevel[level_param.upper()])
            except KeyError:
                return jsonify({"error": f"Invalid level: {level_param}. Must be ERROR, WARN, INFO, or DEBUG"}), 400

        # Optional lower bound on timestamp; naive inputs are assumed UTC.
        if since_param:
            try:
                since_dt = datetime.fromisoformat(since_param.replace('Z', '+00:00'))
                if since_dt.tzinfo is None:
                    since_dt = since_dt.replace(tzinfo=timezone.utc)
                query = query.filter(ClientLog.timestamp >= since_dt)
            except ValueError:
                return jsonify({"error": "Invalid timestamp format. Use ISO 8601"}), 400

        logs = query.order_by(desc(ClientLog.timestamp)).limit(limit).all()

        # Format results.
        result = []
        for log in logs:
            context = log.context
            # The column is declared sa.JSON, so the driver usually hands back
            # a dict already; only decode when we actually received a string.
            # The old unconditional json.loads(log.context) raised TypeError
            # on dict values and turned into a 500.
            if isinstance(context, str):
                try:
                    context = json.loads(context)
                except json.JSONDecodeError:
                    context = {"raw": log.context}
            elif context is None:
                context = {}
            result.append({
                "id": log.id,
                "timestamp": log.timestamp.isoformat() if log.timestamp else None,
                "level": log.level.value if log.level else None,
                "message": log.message,
                "context": context,
            })

        return jsonify({
            "client_uuid": uuid,
            "logs": result,
            "count": len(result),
            "limit": limit,
        })
    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Release the session on every path, including unexpected errors.
        session.close()
|
||||
|
||||
|
||||
@client_logs_bp.route("/summary", methods=["GET"])
@admin_or_higher
def get_logs_summary():
    """
    Get a per-client summary of log counts over a recent window.

    Query params:
    - hours: window size in hours (default 24, max 168 = 1 week)

    Example response (the old docstring showed flat counts; the actual shape
    nests them under "counts" with client info under "info"):
    {
        "summary": {
            "client-uuid-1": {
                "counts": {"ERROR": 5, "WARN": 12, "INFO": 45, "DEBUG": 0},
                "info": {"hostname": "...", "description": "..."}
            }
        },
        "period_hours": 24,
        "since": "2026-03-08T21:00:00+00:00",
        "timestamp": "2026-03-09T21:00:00+00:00"
    }
    """
    session = Session()
    try:
        # Window size: default 24h, capped at one week.  A non-numeric value
        # used to escape as an unhandled ValueError (500); report 400 instead.
        try:
            hours = min(int(request.args.get('hours', 24)), 168)
        except ValueError:
            return jsonify({"error": "Invalid hours: must be an integer"}), 400
        since = datetime.now(timezone.utc) - timedelta(hours=hours)

        # Log counts grouped by client and level in a single query.
        stats = (
            session.query(
                ClientLog.client_uuid,
                ClientLog.level,
                func.count(ClientLog.id).label('count'),
            )
            .filter(ClientLog.timestamp >= since)
            .group_by(ClientLog.client_uuid, ClientLog.level)
            .all()
        )

        # Build the per-client summary, zero-initialising every level.
        summary = {}
        for stat in stats:
            counts = summary.setdefault(
                stat.client_uuid,
                {"ERROR": 0, "WARN": 0, "INFO": 0, "DEBUG": 0},
            )
            counts[stat.level.value] = stat.count

        # Enrich each client's counts with hostname/description.
        clients = session.query(Client.uuid, Client.hostname, Client.description).all()
        client_info = {c.uuid: {"hostname": c.hostname, "description": c.description} for c in clients}
        enriched_summary = {
            client_uuid: {"counts": counts, "info": client_info.get(client_uuid, {})}
            for client_uuid, counts in summary.items()
        }

        return jsonify({
            "summary": enriched_summary,
            "period_hours": hours,
            "since": since.isoformat(),
            "timestamp": datetime.now(timezone.utc).isoformat(),
        })
    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Release the session on every path, including unexpected errors.
        session.close()
|
||||
|
||||
|
||||
@client_logs_bp.route("/recent-errors", methods=["GET"])
@admin_or_higher
def get_recent_errors():
    """
    Get recent ERROR logs across all clients.

    Query params:
    - limit: number of entries (default 20, max 100)

    Useful for system-wide error monitoring.  Returns 400 on a malformed
    limit parameter.
    """
    session = Session()
    try:
        # A non-numeric limit used to escape as an unhandled ValueError (500).
        try:
            limit = min(int(request.args.get('limit', 20)), 100)
        except ValueError:
            return jsonify({"error": "Invalid limit: must be an integer"}), 400
        limit = max(limit, 0)  # a negative LIMIT would be a SQL error

        # Most recent ERROR entries across all clients, newest first.
        logs = (
            session.query(ClientLog)
            .filter_by(level=LogLevel.ERROR)
            .order_by(desc(ClientLog.timestamp))
            .limit(limit)
            .all()
        )

        result = []
        for log in logs:
            context = log.context
            # sa.JSON columns usually come back as dicts; decode defensively
            # (matching get_client_logs) so a malformed stored string or a
            # dict value no longer turns into a 500.
            if isinstance(context, str):
                try:
                    context = json.loads(context)
                except json.JSONDecodeError:
                    context = {"raw": log.context}
            elif context is None:
                context = {}
            result.append({
                "id": log.id,
                "client_uuid": log.client_uuid,
                "timestamp": log.timestamp.isoformat() if log.timestamp else None,
                "message": log.message,
                "context": context,
            })

        return jsonify({
            "errors": result,
            "count": len(result),
        })
    except Exception as e:
        return jsonify({"error": f"Server error: {str(e)}"}), 500
    finally:
        # Release the session on every path, including unexpected errors.
        session.close()
|
||||
@@ -8,6 +8,7 @@ from server.routes.holidays import holidays_bp
|
||||
from server.routes.academic_periods import academic_periods_bp
|
||||
from server.routes.groups import groups_bp
|
||||
from server.routes.clients import clients_bp
|
||||
from server.routes.client_logs import client_logs_bp
|
||||
from server.routes.auth import auth_bp
|
||||
from server.routes.users import users_bp
|
||||
from server.routes.system_settings import system_settings_bp
|
||||
@@ -46,6 +47,7 @@ else:
|
||||
app.register_blueprint(auth_bp)
|
||||
app.register_blueprint(users_bp)
|
||||
app.register_blueprint(clients_bp)
|
||||
app.register_blueprint(client_logs_bp)
|
||||
app.register_blueprint(groups_bp)
|
||||
app.register_blueprint(events_bp)
|
||||
app.register_blueprint(event_exceptions_bp)
|
||||
|
||||
Reference in New Issue
Block a user