diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 6c68e72..285c4dc 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -23,12 +23,25 @@ Use this as your shared context when proposing changes. Keep edits minimal and m - Per-client group assignment (retained): `infoscreen/{uuid}/group_id` via `server/mqtt_helper.py`. - Screenshots: server-side folders `server/received_screenshots/` and `server/screenshots/`; Nginx exposes `/screenshots/{uuid}.jpg` via `server/wsgi.py` route. +- Presentation conversion (PPT/PPTX/ODP → PDF): + - Trigger: on upload in `server/routes/eventmedia.py` for media types `ppt|pptx|odp` (compute sha256, upsert `Conversion`, enqueue job). + - Worker: RQ worker runs `server.worker.convert_event_media_to_pdf`, calls Gotenberg LibreOffice endpoint, writes to `server/media/converted/`. + - Services: Redis (queue) and Gotenberg added in compose; worker service consumes the `conversions` queue. + - Env: `REDIS_URL` (default `redis://redis:6379/0`), `GOTENBERG_URL` (default `http://gotenberg:3000`). + - Endpoints: `POST /api/conversions/<media_id>/pdf` (ensure/enqueue), `GET /api/conversions/<media_id>/status`, `GET /api/files/converted/<filename>` (serve PDFs). + - Storage: originals under `server/media/…`, outputs under `server/media/converted/` (prod compose mounts a shared volume for this path). + ## Data model highlights (see `models/models.py`) - Enums: `EventType` (presentation, website, video, message, webuntis), `MediaType` (file/website types), and `AcademicPeriodType` (schuljahr, semester, trimester). - Tables: `clients`, `client_groups`, `events`, `event_media`, `users`, `academic_periods`, `school_holidays`. - Academic periods: `academic_periods` table supports educational institution cycles (school years, semesters). Events and media can be optionally linked via `academic_period_id` (nullable for backward compatibility). 
- Times are stored as timezone-aware; treat comparisons in UTC (see scheduler and routes/events). +- Conversions: + - Enum `ConversionStatus`: `pending`, `processing`, `ready`, `failed`. + - Table `conversions`: `id`, `source_event_media_id` (FK→`event_media.id` ondelete CASCADE), `target_format`, `target_path`, `status`, `file_hash` (sha256), `started_at`, `completed_at`, `error_message`. + - Indexes: `(source_event_media_id, target_format)`, `(status, target_format)`; Unique: `(source_event_media_id, target_format, file_hash)`. + ## API patterns - Blueprints live in `server/routes/*` and are registered in `server/wsgi.py` with `/api/...` prefixes. - Session usage: instantiate `Session()` per request, commit when mutating, and always `session.close()` before returning. @@ -51,6 +64,8 @@ Use this as your shared context when proposing changes. Keep edits minimal and m - Holidays present in the current view (count) - Period label (display_name or name) with a badge indicating whether any holidays exist in that period (overlap check) +Note: Syncfusion usage in the dashboard is already documented above; if a UI for conversion status/downloads is added later, link its routes and components here. + ## Local development - Compose: development is `docker-compose.yml` + `docker-compose.override.yml`. - API (dev): `server/Dockerfile.dev` with debugpy on 5678, Flask app `wsgi:app` on :8000. 
diff --git a/Makefile b/Makefile index 0295812..88c51ec 100644 --- a/Makefile +++ b/Makefile @@ -25,10 +25,12 @@ help: @echo " up-prod - Start prod stack (docker-compose.prod.yml)" @echo " down-prod - Stop prod stack" @echo " health - Quick health checks" + @echo " fix-perms - Recursively chown workspace to current user" + # ---------- Development stack ---------- .PHONY: up -yup: ## Start dev stack +up: ## Start dev stack $(COMPOSE) up -d --build .PHONY: down @@ -80,3 +82,11 @@ health: ## Quick health checks @echo "Dashboard (dev):" && curl -fsS http://localhost:5173/ || true @echo "MQTT TCP 1883:" && nc -z localhost 1883 && echo OK || echo FAIL @echo "MQTT WS 9001:" && nc -z localhost 9001 && echo OK || echo FAIL + +# ---------- Permissions ---------- +.PHONY: fix-perms +fix-perms: + @echo "Fixing ownership to current user recursively (may prompt for sudo password)..." + sudo chown -R $$(id -u):$$(id -g) . + @echo "Done. Consider adding UID and GID to your .env to prevent future root-owned files:" + @echo " echo UID=$$(id -u) >> .env && echo GID=$$(id -g) >> .env" diff --git a/dashboard/vite.config.ts b/dashboard/vite.config.ts index 57e3533..6d59980 100644 --- a/dashboard/vite.config.ts +++ b/dashboard/vite.config.ts @@ -4,6 +4,7 @@ import react from '@vitejs/plugin-react'; // https://vite.dev/config/ export default defineConfig({ + cacheDir: './.vite', plugins: [react()], resolve: { // πŸ”§ KORRIGIERT: Entferne die problematischen Aliases komplett diff --git a/deployment.md b/deployment.md deleted file mode 100644 index 17ec555..0000000 --- a/deployment.md +++ /dev/null @@ -1,417 +0,0 @@ -# Infoscreen Deployment Guide - -Komplette Anleitung fΓΌr das Deployment des Infoscreen-Systems auf einem Ubuntu-Server mit GitHub Container Registry. 
- -## πŸ“‹ Übersicht - -- **Phase 0**: Docker Installation (optional) -- **Phase 1**: Images bauen und zur Registry pushen -- **Phase 2**: Ubuntu-Server Installation -- **Phase 3**: System-Konfiguration und Start - ---- - -## 🐳 Phase 0: Docker Installation (optional) - -Falls Docker noch nicht installiert ist, wΓ€hlen Sie eine der folgenden Optionen: - -### Option A: Ubuntu Repository (schnell) - -```bash -# Standard Ubuntu Docker-Pakete -sudo apt update -sudo apt install docker.io docker-compose-plugin -y -sudo systemctl enable docker -sudo systemctl start docker -``` - -### Option B: Offizielle Docker-Installation (empfohlen) - -```bash -# Alte Docker-Versionen entfernen -sudo apt remove docker docker-engine docker.io containerd runc -y - -# AbhΓ€ngigkeiten installieren -sudo apt update -sudo apt install ca-certificates curl gnupg lsb-release -y - -# Docker GPG-Key hinzufΓΌgen -sudo mkdir -p /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg - -# Docker Repository hinzufΓΌgen -echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - -# Docker installieren (neueste Version) -sudo apt update -sudo apt install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y - -# Docker aktivieren und starten -sudo systemctl enable docker -sudo systemctl start docker - -# User zur Docker-Gruppe hinzufΓΌgen -sudo usermod -aG docker $USER - -# Neuanmeldung fΓΌr GruppenΓ€nderung erforderlich -exit -# Neu einloggen via SSH -``` - -### Docker-Installation testen - -```bash -# Test-Container ausfΓΌhren -docker run hello-world - -# Docker-Version prΓΌfen -docker --version -docker compose version -``` - ---- - -## πŸ—οΈ Phase 1: Images bauen und pushen (Entwicklungsmaschine) - -### 1. 
GitHub Container Registry Login - -```bash -# GitHub Personal Access Token mit write:packages Berechtigung erstellen -echo $GITHUB_TOKEN | docker login ghcr.io -u robbstarkaustria --password-stdin - -# Oder interaktiv: -docker login ghcr.io -# Username: robbstarkaustria -# Password: [GITHUB_TOKEN] -``` - -### 2. Images bauen und taggen - -```bash -cd /workspace - -# Server-Image bauen -docker build -f server/Dockerfile -t ghcr.io/robbstarkaustria/infoscreen-api:latest . - -# Dashboard-Image bauen -docker build -f dashboard/Dockerfile -t ghcr.io/robbstarkaustria/infoscreen-dashboard:latest . - -# Listener-Image bauen (falls vorhanden) -docker build -f listener/Dockerfile -t ghcr.io/robbstarkaustria/infoscreen-listener:latest . - -# Scheduler-Image bauen (falls vorhanden) -docker build -f scheduler/Dockerfile -t ghcr.io/robbstarkaustria/infoscreen-scheduler:latest . -``` - -### 3. Images zur Registry pushen - -```bash -# Alle Images pushen -docker push ghcr.io/robbstarkaustria/infoscreen-api:latest -docker push ghcr.io/robbstarkaustria/infoscreen-dashboard:latest -docker push ghcr.io/robbstarkaustria/infoscreen-listener:latest -docker push ghcr.io/robbstarkaustria/infoscreen-scheduler:latest - -# Status prΓΌfen -docker images | grep ghcr.io -``` - ---- - -## πŸ–₯️ Phase 2: Ubuntu-Server Installation - -### 4. Ubuntu Server vorbereiten - -```bash -sudo apt update && sudo apt upgrade -y - -# Grundlegende Tools installieren -sudo apt install git curl wget -y - -# Docker installieren (siehe Phase 0) -``` - -### 5. Deployment-Dateien ΓΌbertragen - -```bash -# Deployment-Ordner erstellen -mkdir -p ~/infoscreen-deployment -cd ~/infoscreen-deployment - -# Dateien vom Dev-System kopieren (ΓΌber SCP) -scp user@dev-machine:/workspace/docker-compose.prod.yml . -scp user@dev-machine:/workspace/.env . -scp user@dev-machine:/workspace/nginx.conf . 
-scp -r user@dev-machine:/workspace/certs ./ -scp -r user@dev-machine:/workspace/mosquitto ./ - -# Alternative: Deployment-Paket verwenden -# Auf Dev-Maschine (/workspace): -# tar -czf infoscreen-deployment.tar.gz docker-compose.prod.yml .env nginx.conf certs/ mosquitto/ -# scp infoscreen-deployment.tar.gz user@server:~/ -# Auf Server: tar -xzf infoscreen-deployment.tar.gz -``` - -### 6. Mosquitto-Konfiguration vorbereiten - -```bash -# Falls mosquitto-Ordner noch nicht vollstΓ€ndig vorhanden: -mkdir -p mosquitto/{config,data,log} - -# Mosquitto-Konfiguration erstellen (falls nicht ΓΌbertragen) -cat > mosquitto/config/mosquitto.conf << 'EOF' -# ----------------------------- -# Netzwerkkonfiguration -# ----------------------------- -listener 1883 -allow_anonymous true -# password_file /mosquitto/config/passwd - -# WebSocket (optional) -listener 9001 -protocol websockets - -# ----------------------------- -# Persistence & Pfade -# ----------------------------- -persistence true -persistence_location /mosquitto/data/ - -log_dest file /mosquitto/log/mosquitto.log -EOF - -# Berechtigungen fΓΌr Mosquitto setzen -sudo chown -R 1883:1883 mosquitto/data mosquitto/log -chmod 755 mosquitto/config mosquitto/data mosquitto/log -``` - -### 7. Environment-Variablen anpassen - -```bash -# .env fΓΌr Produktionsumgebung anpassen -nano .env - -# Wichtige Anpassungen: -# VITE_API_URL=https://YOUR_SERVER_HOST/api # FΓΌr Dashboard-Build (Production) -# DB_HOST=db # In Containern immer 'db' -# DB_CONN=mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME} -# Alle PasswΓΆrter fΓΌr Produktion Γ€ndern -``` - -Hinweise: -- Eine Vorlage `.env.example` liegt im Repo. Kopiere sie als Ausgangspunkt: `cp .env.example .env`. -- FΓΌr lokale Entwicklung lΓ€dt `server/database.py` die `.env`, wenn `ENV=development` gesetzt ist. -- In Produktion verwaltet Compose/Container die Variablen; kein automatisches `.env`-Load im Code nΓΆtig. - ---- - -## πŸš€ Phase 3: System-Start und Konfiguration - -### 8. 
Images von Registry pullen - -```bash -# GitHub Container Registry Login (falls private Repository) -echo $GITHUB_TOKEN | docker login ghcr.io -u robbstarkaustria --password-stdin - -# Images pullen -docker compose -f docker-compose.prod.yml pull -``` - -### 9. System starten - -```bash -# Container starten -docker compose -f docker-compose.prod.yml up -d - -# Status prΓΌfen -docker compose ps -docker compose logs -f -``` - -### 10. Firewall konfigurieren - -```bash -sudo ufw enable -sudo ufw allow ssh -sudo ufw allow 80/tcp -sudo ufw allow 443/tcp -sudo ufw allow 1883/tcp # MQTT -sudo ufw allow 9001/tcp # MQTT WebSocket -sudo ufw status -``` - -### 11. Installation validieren - -```bash -# Health-Checks -curl http://localhost/api/health -curl https://localhost -k # -k fΓΌr selbstsignierte Zertifikate - -# Container-Status -docker compose ps - -# Logs bei Problemen anzeigen -docker compose logs server -docker compose logs dashboard -docker compose logs mqtt -``` - ---- - -## πŸ§ͺ Quickstart (Entwicklung) - -Schneller Start der Entwicklungsumgebung mit automatischen Proxys und Hot-Reload. - -```bash -# Im Repository-Root -# 1) .env aus Vorlage erzeugen (lokal, falls noch nicht vorhanden) -cp -n .env.example .env - -# 2) Dev-Stack starten (verwendet docker-compose.yml + docker-compose.override.yml) -docker compose up -d --build - -# 3) Status & Logs -docker compose ps -docker compose logs -f server -docker compose logs -f dashboard -docker compose logs -f mqtt - -# 4) Stack stoppen -docker compose down -``` - -Erreichbarkeit (Dev): -- Dashboard (Vite): http://localhost:5173 -- API (Flask Dev): http://localhost:8000/api -- API Health: http://localhost:8000/health -- Screenshots: http://localhost:8000/screenshots/.jpg -- MQTT: localhost:1883 (WebSocket: localhost:9001) - -Hinweise: -- `ENV=development` lΓ€dt `.env` automatisch in `server/database.py`. -- Vite proxy routet `/api` und `/screenshots` in Dev direkt auf die API (siehe `dashboard/vite.config.ts`). - -### 12. 
Automatischer Start (optional) - -```bash -# Systemd-Service erstellen -sudo tee /etc/systemd/system/infoscreen.service > /dev/null << 'EOF' -[Unit] -Description=Infoscreen Application -Requires=docker.service -After=docker.service - -[Service] -Type=oneshot -RemainAfterExit=yes -WorkingDirectory=/home/$USER/infoscreen-deployment -ExecStart=/usr/bin/docker compose -f docker-compose.prod.yml up -d -ExecStop=/usr/bin/docker compose -f docker-compose.prod.yml down -TimeoutStartSec=300 - -[Install] -WantedBy=multi-user.target -EOF - -# Service aktivieren -sudo systemctl enable infoscreen.service -sudo systemctl start infoscreen.service -``` - ---- - -## 🌐 Zugriff auf die Anwendung - -Nach erfolgreichem Deployment ist die Anwendung unter folgenden URLs erreichbar: - -- **HTTPS Dashboard**: `https://YOUR_SERVER_IP` -- **HTTP Dashboard**: `http://YOUR_SERVER_IP` (Redirect zu HTTPS) -- **API**: `http://YOUR_SERVER_IP/api/` -- **MQTT**: `YOUR_SERVER_IP:1883` -- **MQTT WebSocket**: `YOUR_SERVER_IP:9001` - ---- - -## πŸ”§ Troubleshooting - -### Container-Status prΓΌfen - -```bash -# Alle Container anzeigen -docker compose ps - -# Spezifische Logs anzeigen -docker compose logs -f [service-name] - -# Container einzeln neustarten -docker compose restart [service-name] -``` - -### System neustarten - -```bash -# Komplett neu starten -docker compose down -docker compose up -d - -# Images neu pullen -docker compose pull -docker compose up -d -``` - -### HΓ€ufige Probleme - -| Problem | LΓΆsung | -|---------|--------| -| Container startet nicht | `docker compose logs [service]` prΓΌfen | -| Ports bereits belegt | `sudo netstat -tulpn \| grep :80` prΓΌfen | -| Keine Berechtigung | User zu docker-Gruppe hinzufΓΌgen | -| DB-Verbindung fehlschlΓ€gt | Environment-Variablen in `.env` prΓΌfen | -| Mosquitto startet nicht | Ordner-Berechtigungen fΓΌr `1883:1883` setzen | - ---- - -## πŸ“Š Docker-Version Vergleich - -| Aspekt | Ubuntu Repository | Offizielle Installation | 
-|--------|------------------|------------------------| -| **Installation** | βœ… Schnell (1 Befehl) | ⚠️ Mehrere Schritte | -| **Version** | ⚠️ Oft Γ€lter | βœ… Neueste Version | -| **Updates** | βœ… Via apt | βœ… Via apt (nach Setup) | -| **StabilitΓ€t** | βœ… Getestet | βœ… Aktuell | -| **Features** | ⚠️ MΓΆglicherweise eingeschrΓ€nkt | βœ… Alle Features | - -**Empfehlung:** FΓΌr Produktion die offizielle Docker-Installation verwenden. - ---- - -## πŸ“ Wartung - -### RegelmÀßige Updates - -```bash -# Images aktualisieren -docker compose pull -docker compose up -d - -# System-Updates -sudo apt update && sudo apt upgrade -y -``` - -### Backup - -```bash -# Container-Daten sichern -docker compose down -sudo tar -czf infoscreen-backup-$(date +%Y%m%d).tar.gz mosquitto/data/ certs/ - -# Backup wiederherstellen -sudo tar -xzf infoscreen-backup-YYYYMMDD.tar.gz -docker compose up -d -``` - ---- - -**Das Infoscreen-System ist jetzt vollstΓ€ndig ΓΌber GitHub diff --git a/docker-compose.yml b/docker-compose.yml index 3e1b4be..e49c59c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -68,12 +68,16 @@ services: - ./mosquitto/data:/mosquitto/data - ./mosquitto/log:/mosquitto/log ports: - - "1883:1883" # Standard MQTT - - "9001:9001" # WebSocket (falls benΓΆtigt) + - "1883:1883" # Standard MQTT + - "9001:9001" # WebSocket (falls benΓΆtigt) networks: - infoscreen-net healthcheck: - test: ["CMD-SHELL", "mosquitto_pub -h localhost -t test -m 'health' || exit 1"] + test: + [ + "CMD-SHELL", + "mosquitto_pub -h localhost -t test -m 'health' || exit 1", + ] interval: 30s timeout: 5s retries: 3 @@ -98,10 +102,14 @@ services: MQTT_BROKER_URL: ${MQTT_BROKER_URL} MQTT_USER: ${MQTT_USER} MQTT_PASSWORD: ${MQTT_PASSWORD} + REDIS_URL: "${REDIS_URL:-redis://redis:6379/0}" + GOTENBERG_URL: "${GOTENBERG_URL:-http://gotenberg:3000}" ports: - "8000:8000" networks: - infoscreen-net + volumes: + - media-data:/app/server/media healthcheck: test: ["CMD", "curl", "-f", 
"http://localhost:8000/health"] interval: 30s @@ -140,6 +148,7 @@ services: scheduler: build: + context: . dockerfile: scheduler/Dockerfile image: infoscreen-scheduler:latest container_name: infoscreen-scheduler @@ -157,6 +166,41 @@ services: networks: - infoscreen-net + redis: + image: redis:7-alpine + container_name: infoscreen-redis + restart: unless-stopped + networks: + - infoscreen-net + + gotenberg: + image: gotenberg/gotenberg:8 + container_name: infoscreen-gotenberg + restart: unless-stopped + networks: + - infoscreen-net + + worker: + build: + context: . + dockerfile: server/Dockerfile + image: infoscreen-worker:latest + container_name: infoscreen-worker + restart: unless-stopped + depends_on: + - redis + - gotenberg + - db + environment: + DB_CONN: "mysql+pymysql://${DB_USER}:${DB_PASSWORD}@db/${DB_NAME}" + REDIS_URL: "${REDIS_URL:-redis://redis:6379/0}" + GOTENBERG_URL: "${GOTENBERG_URL:-http://gotenberg:3000}" + PYTHONPATH: /app + command: ["rq", "worker", "conversions"] + networks: + - infoscreen-net + volumes: server-pip-cache: db-data: + media-data: diff --git a/models/models.py b/models/models.py index 3f14e17..a727ff4 100644 --- a/models/models.py +++ b/models/models.py @@ -227,3 +227,45 @@ class SchoolHoliday(Base): "source_file_name": self.source_file_name, "imported_at": self.imported_at.isoformat() if self.imported_at else None, } + +# --- Conversions: Track PPT/PPTX/ODP -> PDF processing state --- + + +class ConversionStatus(enum.Enum): + pending = "pending" + processing = "processing" + ready = "ready" + failed = "failed" + + +class Conversion(Base): + __tablename__ = 'conversions' + + id = Column(Integer, primary_key=True, autoincrement=True) + # Source media to be converted + source_event_media_id = Column( + Integer, + ForeignKey('event_media.id', ondelete='CASCADE'), + nullable=False, + index=True, + ) + target_format = Column(String(10), nullable=False, + index=True) # e.g. 
'pdf' + # relative to server/media + target_path = Column(String(512), nullable=True) + status = Column(Enum(ConversionStatus), nullable=False, + default=ConversionStatus.pending) + file_hash = Column(String(64), nullable=False) # sha256 of source file + started_at = Column(TIMESTAMP(timezone=True), nullable=True) + completed_at = Column(TIMESTAMP(timezone=True), nullable=True) + error_message = Column(Text, nullable=True) + + __table_args__ = ( + # Fast lookup per media/format + Index('ix_conv_source_target', 'source_event_media_id', 'target_format'), + # Operational filtering + Index('ix_conv_status_target', 'status', 'target_format'), + # Idempotency: same source + target + file content should be unique + UniqueConstraint('source_event_media_id', 'target_format', + 'file_hash', name='uq_conv_source_target_hash'), + ) diff --git a/pptx_conversion_guide.md b/pptx_conversion_guide.md new file mode 100644 index 0000000..9e2a21a --- /dev/null +++ b/pptx_conversion_guide.md @@ -0,0 +1,477 @@ +# Recommended Implementation: PPTX-to-PDF Conversion System + +## Architecture Overview + +**Asynchronous server-side conversion with database tracking** + +``` +User Upload β†’ API saves PPTX + DB entry β†’ Job in Queue + ↓ +Client requests β†’ API checks DB status β†’ PDF ready? β†’ Download PDF + β†’ Pending? β†’ "Please wait" + β†’ Failed? β†’ Retry/Error +``` + +## 1. 
Database Schema + +```sql +CREATE TABLE media_files ( + id UUID PRIMARY KEY, + filename VARCHAR(255), + original_path VARCHAR(512), + file_type VARCHAR(10), + mime_type VARCHAR(100), + uploaded_at TIMESTAMP, + updated_at TIMESTAMP +); + +CREATE TABLE conversions ( + id UUID PRIMARY KEY, + source_file_id UUID REFERENCES media_files(id) ON DELETE CASCADE, + target_format VARCHAR(10), -- 'pdf' + target_path VARCHAR(512), -- Path to generated PDF + status VARCHAR(20), -- 'pending', 'processing', 'ready', 'failed' + started_at TIMESTAMP, + completed_at TIMESTAMP, + error_message TEXT, + file_hash VARCHAR(64) -- Hash of PPTX for cache invalidation +); + +CREATE INDEX idx_conversions_source ON conversions(source_file_id, target_format); +``` + +## 2. Components + +### **API Server (existing)** +- Accepts uploads +- Creates DB entries +- Enqueues jobs +- Delivers status and files + +### **Background Worker (new)** +- Runs as separate process in **same container** as API +- Processes conversion jobs from queue +- Can run multiple worker instances in parallel +- Technology: Python RQ, Celery, or similar + +### **Message Queue** +- Redis (recommended for start - simple, fast) +- Alternative: RabbitMQ for more features + +### **Redis Container (new)** +- Separate container for Redis +- Handles job queue +- Minimal resource footprint + +## 3. Detailed Workflow + +### **Upload Process:** + +```python +@app.post("/upload") +async def upload_file(file): + # 1. Save PPTX + file_path = save_to_disk(file) + + # 2. DB entry for original file + file_record = db.create_media_file({ + 'filename': file.filename, + 'original_path': file_path, + 'file_type': 'pptx' + }) + + # 3. Create conversion record + conversion = db.create_conversion({ + 'source_file_id': file_record.id, + 'target_format': 'pdf', + 'status': 'pending', + 'file_hash': calculate_hash(file_path) + }) + + # 4. Enqueue job (asynchronous!) + queue.enqueue(convert_to_pdf, conversion.id) + + # 5. 
Return immediately to user + return { + 'file_id': file_record.id, + 'status': 'uploaded', + 'conversion_status': 'pending' + } +``` + +### **Worker Process:** + +```python +def convert_to_pdf(conversion_id): + conversion = db.get_conversion(conversion_id) + source_file = db.get_media_file(conversion.source_file_id) + + # Status update: processing + db.update_conversion(conversion_id, { + 'status': 'processing', + 'started_at': now() + }) + + try: + # LibreOffice Conversion + pdf_path = f"/data/converted/{conversion.id}.pdf" + subprocess.run([ + 'libreoffice', + '--headless', + '--convert-to', 'pdf', + '--outdir', '/data/converted/', + source_file.original_path + ], check=True) + + # Success + db.update_conversion(conversion_id, { + 'status': 'ready', + 'target_path': pdf_path, + 'completed_at': now() + }) + + except Exception as e: + # Error + db.update_conversion(conversion_id, { + 'status': 'failed', + 'error_message': str(e), + 'completed_at': now() + }) +``` + +### **Client Download:** + +```python +@app.get("/files/{file_id}/display") +async def get_display_file(file_id): + file = db.get_media_file(file_id) + + # Only for PPTX: check PDF conversion + if file.file_type == 'pptx': + conversion = db.get_latest_conversion(file.id, target_format='pdf') + + if not conversion: + # Shouldn't happen, but just to be safe + trigger_new_conversion(file.id) + return {'status': 'pending', 'message': 'Conversion is being created'} + + if conversion.status == 'ready': + return FileResponse(conversion.target_path) + + elif conversion.status == 'failed': + # Optional: Auto-retry + trigger_new_conversion(file.id) + return {'status': 'failed', 'error': conversion.error_message} + + else: # pending or processing + return {'status': conversion.status, 'message': 'Please wait...'} + + # Serve other file types directly + return FileResponse(file.original_path) +``` + +## 4. 
Docker Setup + +```yaml +version: '3.8' + +services: + # Your API Server + api: + build: ./api + command: uvicorn main:app --host 0.0.0.0 --port 8000 + ports: + - "8000:8000" + volumes: + - ./data/uploads:/data/uploads + - ./data/converted:/data/converted + environment: + - REDIS_URL=redis://redis:6379 + - DATABASE_URL=postgresql://postgres:password@postgres:5432/infoscreen + depends_on: + - redis + - postgres + restart: unless-stopped + + # Worker (same codebase as API, different command) + worker: + build: ./api # Same build as API! + command: python worker.py # or: rq worker + volumes: + - ./data/uploads:/data/uploads + - ./data/converted:/data/converted + environment: + - REDIS_URL=redis://redis:6379 + - DATABASE_URL=postgresql://postgres:password@postgres:5432/infoscreen + depends_on: + - redis + - postgres + restart: unless-stopped + # Optional: Multiple workers + deploy: + replicas: 2 + + # Redis - separate container + redis: + image: redis:7-alpine + volumes: + - redis-data:/data + # Optional: persistent configuration + command: redis-server --appendonly yes + restart: unless-stopped + + # Your existing Postgres + postgres: + image: postgres:15 + environment: + - POSTGRES_DB=infoscreen + - POSTGRES_PASSWORD=password + volumes: + - postgres-data:/var/lib/postgresql/data + restart: unless-stopped + + # Optional: Redis Commander (UI for debugging) + redis-commander: + image: rediscommander/redis-commander + environment: + - REDIS_HOSTS=local:redis:6379 + ports: + - "8081:8081" + depends_on: + - redis + +volumes: + redis-data: + postgres-data: +``` + +## 5. Container Communication + +Containers communicate via **Docker's internal network**: + +```python +# In your API/Worker code: +import redis + +# Connection to Redis +redis_client = redis.from_url('redis://redis:6379') +# ^^^^^^ +# Container name = hostname in Docker network +``` + +Docker automatically creates DNS entries, so `redis` resolves to the Redis container. + +## 6. 
Client Behavior (Pi5) + +```python +# On the Pi5 client +def display_file(file_id): + response = api.get(f"/files/{file_id}/display") + + if response.content_type == 'application/pdf': + # PDF is ready + download_and_display(response) + subprocess.run(['impressive', downloaded_pdf]) + + elif response.json()['status'] in ['pending', 'processing']: + # Wait and retry + show_loading_screen("Presentation is being prepared...") + time.sleep(5) + display_file(file_id) # Retry + + else: + # Error + show_error_screen("Error loading presentation") +``` + +## 7. Additional Features + +### **Cache Invalidation on PPTX Update:** + +```python +@app.put("/files/{file_id}") +async def update_file(file_id, new_file): + # Delete old conversions + db.mark_conversions_as_obsolete(file_id) + + # Update file + update_media_file(file_id, new_file) + + # Trigger new conversion + trigger_conversion(file_id, 'pdf') +``` + +### **Status API for Monitoring:** + +```python +@app.get("/admin/conversions/status") +async def get_conversion_stats(): + return { + 'pending': db.count(status='pending'), + 'processing': db.count(status='processing'), + 'failed': db.count(status='failed'), + 'avg_duration_seconds': db.avg_duration() + } +``` + +### **Cleanup Job (Cronjob):** + +```python +def cleanup_old_conversions(): + # Remove PDFs from deleted files + db.delete_orphaned_conversions() + + # Clean up old failed conversions + db.delete_old_failed_conversions(older_than_days=7) +``` + +## 8. 
Redis Container Details + +### **Why Separate Container?** + +βœ… **Separation of Concerns**: Each service has its own responsibility +βœ… **Independent Lifecycle Management**: Redis can be restarted/updated independently +βœ… **Better Scaling**: Redis can be moved to different hardware +βœ… **Easier Backup**: Redis data can be backed up separately +βœ… **Standard Docker Pattern**: Microservices architecture + +### **Resource Usage:** +- RAM: ~10-50 MB for your use case +- CPU: Minimal +- Disk: Only for persistence (optional) + +For 10 clients with occasional PPTX uploads, this is absolutely no problem. + +## 9. Advantages of This Solution + +βœ… **Scalable**: Workers can be scaled horizontally +βœ… **Performant**: Clients don't wait for conversion +βœ… **Robust**: Status tracking and error handling +βœ… **Maintainable**: Clear separation of responsibilities +βœ… **Transparent**: Status queryable at any time +βœ… **Efficient**: One-time conversion per file +βœ… **Future-proof**: Easily extensible for other formats +βœ… **Professional**: Industry-standard architecture + +## 10. Migration Path + +### **Phase 1 (MVP):** +- 1 worker process in API container +- Redis for queue (separate container) +- Basic DB schema +- Simple retry logic + +### **Phase 2 (as needed):** +- Multiple worker instances +- Dedicated conversion service container +- Monitoring & alerting +- Prioritization logic +- Advanced caching strategies + +**Start simple, scale when needed!** + +## 11. 
Key Decisions Summary + +| Aspect | Decision | Reason | +|--------|----------|--------| +| **Conversion Location** | Server-side | One conversion per file, consistent results | +| **Conversion Timing** | Asynchronous (on upload) | No client waiting time, predictable performance | +| **Data Storage** | Database-tracked | Status visibility, robust error handling | +| **Queue System** | Redis (separate container) | Standard pattern, scalable, maintainable | +| **Worker Architecture** | Background process in API container | Simple start, easy to separate later | + +## 12. File Flow Diagram + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ User Upload β”‚ +β”‚ (PPTX) β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ API Server β”‚ +β”‚ 1. Save PPTX β”‚ +β”‚ 2. Create DB rec β”‚ +β”‚ 3. Enqueue job β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Redis Queue │◄─────┐ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β”‚ β”‚ + β–Ό β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ Worker Process β”‚ β”‚ +β”‚ 1. Get job β”‚ β”‚ +β”‚ 2. Convert PPTX β”‚ β”‚ +β”‚ 3. Update DB β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β”‚ β”‚ + β–Ό β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ PDF Storage β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β”‚ β”‚ + β–Ό β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ Client Requests β”‚ β”‚ +β”‚ 1. Check DB β”‚ β”‚ +β”‚ 2. Download PDF β”‚ β”‚ +β”‚ 3. Display β”‚β”€β”€β”€β”€β”€β”€β”˜ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + (via impressive) +``` + +## 13. 
Implementation Checklist + +### Database Setup +- [ ] Create `media_files` table +- [ ] Create `conversions` table +- [ ] Add indexes for performance +- [ ] Set up foreign key constraints + +### API Changes +- [ ] Modify upload endpoint to create DB records +- [ ] Add conversion job enqueueing +- [ ] Implement file download endpoint with status checking +- [ ] Add status API for monitoring +- [ ] Implement cache invalidation on file update + +### Worker Setup +- [ ] Create worker script/module +- [ ] Implement LibreOffice conversion logic +- [ ] Add error handling and retry logic +- [ ] Set up logging and monitoring + +### Docker Configuration +- [ ] Add Redis container to docker-compose.yml +- [ ] Configure worker container +- [ ] Set up volume mounts for file storage +- [ ] Configure environment variables +- [ ] Set up container dependencies + +### Client Updates +- [ ] Modify client to check conversion status +- [ ] Implement retry logic for pending conversions +- [ ] Add loading/waiting screens +- [ ] Implement error handling + +### Testing +- [ ] Test upload β†’ conversion β†’ download flow +- [ ] Test multiple concurrent conversions +- [ ] Test error handling (corrupted PPTX, etc.) 
+- [ ] Test cache invalidation on file update +- [ ] Load test with multiple clients + +### Monitoring & Operations +- [ ] Set up logging for conversions +- [ ] Implement cleanup job for old files +- [ ] Add metrics for conversion times +- [ ] Set up alerts for failed conversions +- [ ] Document backup procedures + +--- + +**This architecture provides a solid foundation that's simple to start with but scales professionally as your needs grow!** \ No newline at end of file diff --git a/pptx_conversion_guide_gotenberg.md b/pptx_conversion_guide_gotenberg.md new file mode 100644 index 0000000..259e4bb --- /dev/null +++ b/pptx_conversion_guide_gotenberg.md @@ -0,0 +1,815 @@ +# Recommended Implementation: PPTX-to-PDF Conversion System with Gotenberg + +## Architecture Overview + +**Asynchronous server-side conversion using Gotenberg with shared storage** + +``` +User Upload β†’ API saves PPTX β†’ Job in Queue β†’ Worker calls Gotenberg API + ↓ + Gotenberg converts via shared volume + ↓ +Client requests β†’ API checks DB status β†’ PDF ready? β†’ Download PDF from shared storage + β†’ Pending? β†’ "Please wait" + β†’ Failed? β†’ Retry/Error +``` + +## 1. Database Schema + +```sql +CREATE TABLE media_files ( + id UUID PRIMARY KEY, + filename VARCHAR(255), + original_path VARCHAR(512), + file_type VARCHAR(10), + mime_type VARCHAR(100), + uploaded_at TIMESTAMP, + updated_at TIMESTAMP +); + +CREATE TABLE conversions ( + id UUID PRIMARY KEY, + source_file_id UUID REFERENCES media_files(id) ON DELETE CASCADE, + target_format VARCHAR(10), -- 'pdf' + target_path VARCHAR(512), -- Path to generated PDF + status VARCHAR(20), -- 'pending', 'processing', 'ready', 'failed' + started_at TIMESTAMP, + completed_at TIMESTAMP, + error_message TEXT, + file_hash VARCHAR(64) -- Hash of PPTX for cache invalidation +); + +CREATE INDEX idx_conversions_source ON conversions(source_file_id, target_format); +``` + +## 2. 
Components + +### **API Server (existing)** +- Accepts uploads +- Creates DB entries +- Enqueues jobs +- Delivers status and files + +### **Background Worker (new)** +- Runs as separate process in **same container** as API +- Processes conversion jobs from queue +- Calls Gotenberg API for conversion +- Updates database with results +- Technology: Python RQ, Celery, or similar + +### **Gotenberg Container (new)** +- Dedicated conversion service +- HTTP API for document conversion +- Handles LibreOffice conversions internally +- Accesses files via shared volume + +### **Message Queue** +- Redis (recommended for start - simple, fast) +- Alternative: RabbitMQ for more features + +### **Redis Container (separate)** +- Handles job queue +- Minimal resource footprint + +### **Shared Storage** +- Docker volume mounted to all containers that need file access +- API, Worker, and Gotenberg all access same files +- Simplifies file exchange between services + +## 3. Detailed Workflow + +### **Upload Process:** + +```python +@app.post("/upload") +async def upload_file(file): + # 1. Save PPTX to shared volume + file_path = save_to_disk(file) # e.g., /shared/uploads/abc123.pptx + + # 2. DB entry for original file + file_record = db.create_media_file({ + 'filename': file.filename, + 'original_path': file_path, + 'file_type': 'pptx' + }) + + # 3. Create conversion record + conversion = db.create_conversion({ + 'source_file_id': file_record.id, + 'target_format': 'pdf', + 'status': 'pending', + 'file_hash': calculate_hash(file_path) + }) + + # 4. Enqueue job (asynchronous!) + queue.enqueue(convert_to_pdf_via_gotenberg, conversion.id) + + # 5. 
Return immediately to user + return { + 'file_id': file_record.id, + 'status': 'uploaded', + 'conversion_status': 'pending' + } +``` + +### **Worker Process (calls Gotenberg):** + +```python +import requests +import os + +GOTENBERG_URL = os.getenv('GOTENBERG_URL', 'http://gotenberg:3000') + +def convert_to_pdf_via_gotenberg(conversion_id): + conversion = db.get_conversion(conversion_id) + source_file = db.get_media_file(conversion.source_file_id) + + # Status update: processing + db.update_conversion(conversion_id, { + 'status': 'processing', + 'started_at': now() + }) + + try: + # Prepare output path + pdf_filename = f"{conversion.id}.pdf" + pdf_path = f"/shared/converted/{pdf_filename}" + + # Call Gotenberg API + # Gotenberg accesses the file via shared volume + with open(source_file.original_path, 'rb') as f: + files = { + 'files': (os.path.basename(source_file.original_path), f) + } + + response = requests.post( + f'{GOTENBERG_URL}/forms/libreoffice/convert', + files=files, + timeout=300 # 5 minutes timeout + ) + response.raise_for_status() + + # Save PDF to shared volume + with open(pdf_path, 'wb') as pdf_file: + pdf_file.write(response.content) + + # Success + db.update_conversion(conversion_id, { + 'status': 'ready', + 'target_path': pdf_path, + 'completed_at': now() + }) + + except requests.exceptions.Timeout: + db.update_conversion(conversion_id, { + 'status': 'failed', + 'error_message': 'Conversion timeout after 5 minutes', + 'completed_at': now() + }) + except requests.exceptions.RequestException as e: + db.update_conversion(conversion_id, { + 'status': 'failed', + 'error_message': f'Gotenberg API error: {str(e)}', + 'completed_at': now() + }) + except Exception as e: + db.update_conversion(conversion_id, { + 'status': 'failed', + 'error_message': str(e), + 'completed_at': now() + }) +``` + +### **Alternative: Direct File Access via Shared Volume** + +If you prefer Gotenberg to read from shared storage directly (more efficient for large files): + 
+```python +def convert_to_pdf_via_gotenberg_shared(conversion_id): + conversion = db.get_conversion(conversion_id) + source_file = db.get_media_file(conversion.source_file_id) + + db.update_conversion(conversion_id, { + 'status': 'processing', + 'started_at': now() + }) + + try: + pdf_filename = f"{conversion.id}.pdf" + pdf_path = f"/shared/converted/{pdf_filename}" + + # Gotenberg reads directly from shared volume + # We just tell it where to find the file + with open(source_file.original_path, 'rb') as f: + files = {'files': f} + + response = requests.post( + f'{GOTENBERG_URL}/forms/libreoffice/convert', + files=files, + timeout=300 + ) + response.raise_for_status() + + # Write result to shared volume + with open(pdf_path, 'wb') as pdf_file: + pdf_file.write(response.content) + + db.update_conversion(conversion_id, { + 'status': 'ready', + 'target_path': pdf_path, + 'completed_at': now() + }) + + except Exception as e: + db.update_conversion(conversion_id, { + 'status': 'failed', + 'error_message': str(e), + 'completed_at': now() + }) +``` + +### **Client Download:** + +```python +@app.get("/files/{file_id}/display") +async def get_display_file(file_id): + file = db.get_media_file(file_id) + + # Only for PPTX: check PDF conversion + if file.file_type == 'pptx': + conversion = db.get_latest_conversion(file.id, target_format='pdf') + + if not conversion: + # Shouldn't happen, but just to be safe + trigger_new_conversion(file.id) + return {'status': 'pending', 'message': 'Conversion is being created'} + + if conversion.status == 'ready': + # Serve PDF from shared storage + return FileResponse(conversion.target_path) + + elif conversion.status == 'failed': + # Optional: Auto-retry + trigger_new_conversion(file.id) + return {'status': 'failed', 'error': conversion.error_message} + + else: # pending or processing + return {'status': conversion.status, 'message': 'Please wait...'} + + # Serve other file types directly + return FileResponse(file.original_path) +``` + 
+## 4. Docker Setup + +```yaml +version: '3.8' + +services: + # Your API Server + api: + build: ./api + command: uvicorn main:app --host 0.0.0.0 --port 8000 + ports: + - "8000:8000" + volumes: + - shared-storage:/shared # Shared volume + environment: + - REDIS_URL=redis://redis:6379 + - DATABASE_URL=postgresql://postgres:password@postgres:5432/infoscreen + - GOTENBERG_URL=http://gotenberg:3000 + depends_on: + - redis + - postgres + - gotenberg + restart: unless-stopped + + # Worker (same codebase as API, different command) + worker: + build: ./api # Same build as API! + command: python worker.py # or: rq worker + volumes: + - shared-storage:/shared # Shared volume + environment: + - REDIS_URL=redis://redis:6379 + - DATABASE_URL=postgresql://postgres:password@postgres:5432/infoscreen + - GOTENBERG_URL=http://gotenberg:3000 + depends_on: + - redis + - postgres + - gotenberg + restart: unless-stopped + # Optional: Multiple workers + deploy: + replicas: 2 + + # Gotenberg - Document Conversion Service + gotenberg: + image: gotenberg/gotenberg:8 + # Gotenberg doesn't need the shared volume if files are sent via HTTP + # But mount it if you want direct file access + volumes: + - shared-storage:/shared # Optional: for direct file access + environment: + # Gotenberg configuration + - GOTENBERG_API_TIMEOUT=300s + - GOTENBERG_LOG_LEVEL=info + restart: unless-stopped + # Resource limits (optional but recommended) + deploy: + resources: + limits: + cpus: '2.0' + memory: 2G + reservations: + cpus: '0.5' + memory: 512M + + # Redis - separate container + redis: + image: redis:7-alpine + volumes: + - redis-data:/data + command: redis-server --appendonly yes + restart: unless-stopped + + # Your existing Postgres + postgres: + image: postgres:15 + environment: + - POSTGRES_DB=infoscreen + - POSTGRES_PASSWORD=password + volumes: + - postgres-data:/var/lib/postgresql/data + restart: unless-stopped + + # Optional: Redis Commander (UI for debugging) + redis-commander: + image: 
rediscommander/redis-commander + environment: + - REDIS_HOSTS=local:redis:6379 + ports: + - "8081:8081" + depends_on: + - redis + +volumes: + shared-storage: # New: Shared storage for all file operations + redis-data: + postgres-data: +``` + +## 5. Storage Structure + +``` +/shared/ +β”œβ”€β”€ uploads/ # Original uploaded files (PPTX, etc.) +β”‚ β”œβ”€β”€ abc123.pptx +β”‚ β”œβ”€β”€ def456.pptx +β”‚ └── ... +└── converted/ # Converted PDF files + β”œβ”€β”€ uuid-1.pdf + β”œβ”€β”€ uuid-2.pdf + └── ... +``` + +## 6. Gotenberg Integration Details + +### **Gotenberg API Endpoints:** + +Gotenberg provides various conversion endpoints: + +```python +# LibreOffice conversion (for PPTX, DOCX, ODT, etc.) +POST http://gotenberg:3000/forms/libreoffice/convert + +# HTML to PDF +POST http://gotenberg:3000/forms/chromium/convert/html + +# Markdown to PDF +POST http://gotenberg:3000/forms/chromium/convert/markdown + +# Merge PDFs +POST http://gotenberg:3000/forms/pdfengines/merge +``` + +### **Example Conversion Request:** + +```python +import requests + +def convert_with_gotenberg(input_file_path, output_file_path): + """ + Convert document using Gotenberg + """ + with open(input_file_path, 'rb') as f: + files = { + 'files': (os.path.basename(input_file_path), f, + 'application/vnd.openxmlformats-officedocument.presentationml.presentation') + } + + # Optional: Add conversion parameters + data = { + 'landscape': 'false', # Portrait mode + 'nativePageRanges': '1-', # All pages + } + + response = requests.post( + 'http://gotenberg:3000/forms/libreoffice/convert', + files=files, + data=data, + timeout=300 + ) + + if response.status_code == 200: + with open(output_file_path, 'wb') as out: + out.write(response.content) + return True + else: + raise Exception(f"Gotenberg error: {response.status_code} - {response.text}") +``` + +### **Advanced Options:** + +```python +# With custom PDF properties +data = { + 'landscape': 'false', + 'nativePageRanges': '1-10', # Only first 10 pages + 
'pdfFormat': 'PDF/A-1a', # PDF/A format + 'exportFormFields': 'false', +} + +# With password protection +data = { + 'userPassword': 'secret123', + 'ownerPassword': 'admin456', +} +``` + +## 7. Client Behavior (Pi5) + +```python +# On the Pi5 client +def display_file(file_id): + response = api.get(f"/files/{file_id}/display") + + if response.content_type == 'application/pdf': + # PDF is ready + download_and_display(response) + subprocess.run(['impressive', downloaded_pdf]) + + elif response.json()['status'] in ['pending', 'processing']: + # Wait and retry + show_loading_screen("Presentation is being prepared...") + time.sleep(5) + display_file(file_id) # Retry + + else: + # Error + show_error_screen("Error loading presentation") +``` + +## 8. Additional Features + +### **Cache Invalidation on PPTX Update:** + +```python +@app.put("/files/{file_id}") +async def update_file(file_id, new_file): + # Delete old conversions and PDFs + conversions = db.get_conversions_for_file(file_id) + for conv in conversions: + if conv.target_path and os.path.exists(conv.target_path): + os.remove(conv.target_path) + + db.mark_conversions_as_obsolete(file_id) + + # Update file + update_media_file(file_id, new_file) + + # Trigger new conversion + trigger_conversion(file_id, 'pdf') +``` + +### **Status API for Monitoring:** + +```python +@app.get("/admin/conversions/status") +async def get_conversion_stats(): + return { + 'pending': db.count(status='pending'), + 'processing': db.count(status='processing'), + 'failed': db.count(status='failed'), + 'avg_duration_seconds': db.avg_duration(), + 'gotenberg_health': check_gotenberg_health() + } + +def check_gotenberg_health(): + try: + response = requests.get( + f'{GOTENBERG_URL}/health', + timeout=5 + ) + return response.status_code == 200 + except: + return False +``` + +### **Cleanup Job (Cronjob):** + +```python +def cleanup_old_conversions(): + # Remove PDFs from deleted files + orphaned = db.get_orphaned_conversions() + for conv in 
orphaned: + if conv.target_path and os.path.exists(conv.target_path): + os.remove(conv.target_path) + db.delete_conversion(conv.id) + + # Clean up old failed conversions + old_failed = db.get_old_failed_conversions(older_than_days=7) + for conv in old_failed: + db.delete_conversion(conv.id) +``` + +## 9. Advantages of Using Gotenberg + +βœ… **Specialized Service**: Optimized specifically for document conversion +βœ… **No LibreOffice Management**: Gotenberg handles LibreOffice lifecycle internally +βœ… **Better Resource Management**: Isolated conversion process +βœ… **HTTP API**: Clean, standard interface +βœ… **Production Ready**: Battle-tested, actively maintained +βœ… **Multiple Formats**: Supports PPTX, DOCX, ODT, HTML, Markdown, etc. +βœ… **PDF Features**: Merge, encrypt, watermark PDFs +βœ… **Health Checks**: Built-in health endpoint +βœ… **Horizontal Scaling**: Can run multiple Gotenberg instances +βœ… **Memory Safe**: Automatic cleanup and restart on issues + +## 10. Migration Path + +### **Phase 1 (MVP):** +- 1 worker process in API container +- Redis for queue (separate container) +- Gotenberg for conversion (separate container) +- Basic DB schema +- Shared volume for file exchange +- Simple retry logic + +### **Phase 2 (as needed):** +- Multiple worker instances +- Multiple Gotenberg instances (load balancing) +- Monitoring & alerting +- Prioritization logic +- Advanced caching strategies +- PDF optimization/compression + +**Start simple, scale when needed!** + +## 11. 
Key Decisions Summary + +| Aspect | Decision | Reason | +|--------|----------|--------| +| **Conversion Location** | Server-side (Gotenberg) | One conversion per file, consistent results | +| **Conversion Service** | Dedicated Gotenberg container | Specialized, production-ready, better isolation | +| **Conversion Timing** | Asynchronous (on upload) | No client waiting time, predictable performance | +| **Data Storage** | Database-tracked | Status visibility, robust error handling | +| **File Exchange** | Shared Docker volume | Simple, efficient, no network overhead | +| **Queue System** | Redis (separate container) | Standard pattern, scalable, maintainable | +| **Worker Architecture** | Background process in API container | Simple start, easy to separate later | + +## 12. File Flow Diagram + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ User Upload β”‚ +β”‚ (PPTX) β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ API Server β”‚ +β”‚ 1. Save to /shared β”‚ +β”‚ 2. Create DB record β”‚ +β”‚ 3. Enqueue job β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Redis Queue β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Worker Process β”‚ +β”‚ 1. Get job β”‚ +β”‚ 2. Call Gotenberg β”‚ +β”‚ 3. Update DB β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Gotenberg β”‚ +β”‚ 1. Read from /shared β”‚ +β”‚ 2. Convert PPTX β”‚ +β”‚ 3. 
Return PDF β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Worker saves PDF β”‚ +β”‚ to /shared/convertedβ”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Client Requests β”‚ +β”‚ 1. Check DB β”‚ +β”‚ 2. Download PDF β”‚ +β”‚ 3. Display β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + (via impressive) +``` + +## 13. Implementation Checklist + +### Database Setup +- [ ] Create `media_files` table +- [ ] Create `conversions` table +- [ ] Add indexes for performance +- [ ] Set up foreign key constraints + +### Storage Setup +- [ ] Create shared Docker volume +- [ ] Set up directory structure (/shared/uploads, /shared/converted) +- [ ] Configure proper permissions + +### API Changes +- [ ] Modify upload endpoint to save to shared storage +- [ ] Create DB records for uploads +- [ ] Add conversion job enqueueing +- [ ] Implement file download endpoint with status checking +- [ ] Add status API for monitoring +- [ ] Implement cache invalidation on file update + +### Worker Setup +- [ ] Create worker script/module +- [ ] Implement Gotenberg API calls +- [ ] Add error handling and retry logic +- [ ] Set up logging and monitoring +- [ ] Handle timeouts and failures + +### Docker Configuration +- [ ] Add Gotenberg container to docker-compose.yml +- [ ] Add Redis container to docker-compose.yml +- [ ] Configure worker container +- [ ] Set up shared volume mounts +- [ ] Configure environment variables +- [ ] Set up container dependencies +- [ ] Configure resource limits for Gotenberg + +### Client Updates +- [ ] Modify client to check conversion status +- [ ] Implement retry logic for pending conversions +- [ ] Add loading/waiting screens +- [ ] Implement error handling + +### Testing +- [ ] Test upload β†’ 
conversion β†’ download flow +- [ ] Test multiple concurrent conversions +- [ ] Test error handling (corrupted PPTX, etc.) +- [ ] Test Gotenberg timeout handling +- [ ] Test cache invalidation on file update +- [ ] Load test with multiple clients +- [ ] Test Gotenberg health checks + +### Monitoring & Operations +- [ ] Set up logging for conversions +- [ ] Monitor Gotenberg health endpoint +- [ ] Implement cleanup job for old files +- [ ] Add metrics for conversion times +- [ ] Set up alerts for failed conversions +- [ ] Monitor shared storage disk usage +- [ ] Document backup procedures + +### Security +- [ ] Validate file types before conversion +- [ ] Set file size limits +- [ ] Sanitize filenames +- [ ] Implement rate limiting +- [ ] Secure inter-container communication + +## 14. Gotenberg Configuration Options + +### **Environment Variables:** + +```yaml +gotenberg: + image: gotenberg/gotenberg:8 + environment: + # API Configuration + - GOTENBERG_API_TIMEOUT=300s + - GOTENBERG_API_PORT=3000 + + # Logging + - GOTENBERG_LOG_LEVEL=info # debug, info, warn, error + + # LibreOffice + - GOTENBERG_LIBREOFFICE_DISABLE_ROUTES=false + - GOTENBERG_LIBREOFFICE_AUTO_START=true + + # Chromium (if needed for HTML/Markdown) + - GOTENBERG_CHROMIUM_DISABLE_ROUTES=true # Disable if not needed + + # Resource limits + - GOTENBERG_LIBREOFFICE_MAX_QUEUE_SIZE=100 +``` + +### **Custom Gotenberg Configuration:** + +For advanced configurations, create a `gotenberg.yml`: + +```yaml +api: + timeout: 300s + port: 3000 + +libreoffice: + autoStart: true + maxQueueSize: 100 + +chromium: + disableRoutes: true +``` + +Mount it in docker-compose: + +```yaml +gotenberg: + image: gotenberg/gotenberg:8 + volumes: + - ./gotenberg.yml:/etc/gotenberg/config.yml:ro + - shared-storage:/shared +``` + +## 15. 
Troubleshooting

### **Common Issues:**

**Gotenberg timeout:**
```python
# Increase timeout for large files
response = requests.post(
    f'{GOTENBERG_URL}/forms/libreoffice/convert',
    files=files,
    timeout=600  # 10 minutes for large PPTX
)
```

**Memory issues:**
```yaml
# Increase Gotenberg memory limit
gotenberg:
  deploy:
    resources:
      limits:
        memory: 4G
```

**File permission issues:**
```bash
# Ensure proper permissions on shared volume
chmod -R 755 /shared
chown -R 1000:1000 /shared
```

**Gotenberg not responding:**
```python
# Check health before conversion
def ensure_gotenberg_healthy():
    try:
        response = requests.get(f'{GOTENBERG_URL}/health', timeout=5)
        if response.status_code != 200:
            raise Exception("Gotenberg unhealthy")
    except Exception as e:
        logger.error(f"Gotenberg health check failed: {e}")
        raise
```

---

**This architecture provides a production-ready, scalable solution using Gotenberg as a specialized conversion service with efficient file sharing via Docker volumes!**

## 16. Best Practices Specific to Infoscreen

- Idempotency by content: Always compute a SHA‑256 of the uploaded source and include it in the unique key (source_event_media_id, target_format, file_hash). This prevents duplicate work for identical content and auto-busts cache on change.
- Strict MIME/type validation: Accept only .ppt, .pptx, .odp for conversion. Reject unknown types early. Consider reading the first bytes (magic) for extra safety.
- Bounded retries with jitter: Retry conversions on transient HTTP 5xx or timeouts up to N times with exponential backoff. Do not retry on 4xx or clear user errors.
- Output naming: Derive deterministic output paths under media/converted/, e.g., `<hash>.pdf`. Ensure no path traversal and sanitize names.
- Timeouts and size limits: Enforce server-side max upload size and per-job conversion timeout (e.g., 10 minutes). Return clear errors for oversized/long-running files. 
+- Isolation and quotas: Set CPU/memory limits for Gotenberg; consider a concurrency cap per worker to avoid DB starvation. +- Health probes before work: Check Gotenberg /health prior to enqueue spikes; fail-fast to avoid queue pile-ups when Gotenberg is down. +- Observability: Log job IDs, file hashes, durations, and sizes. Expose a small /api/conversions/status summary for operational visibility. +- Cleanup policy: Periodically delete orphaned conversions (media deleted) and failed jobs older than X days. Keep successful PDFs aligned with DB rows. +- Security: Never trust client paths; always resolve relative to the known media root. Do not expose the shared volume directly; serve via API only. +- Backpressure: If queue length exceeds a threshold, surface 503/β€œtry later” on new uploads or pause enqueue to protect the system. diff --git a/server/__init__.py b/server/__init__.py index e69de29..793d47d 100644 --- a/server/__init__.py +++ b/server/__init__.py @@ -0,0 +1,8 @@ +"""Server package initializer. + +Expose submodules required by external importers (e.g., RQ string paths). +""" + +# Ensure 'server.worker' is available as an attribute of the 'server' package +# so that RQ can resolve 'server.worker.convert_event_media_to_pdf'. +from . import worker # noqa: F401 diff --git a/server/alembic/versions/2b627d0885c3_merge_heads_after_conversions.py b/server/alembic/versions/2b627d0885c3_merge_heads_after_conversions.py new file mode 100644 index 0000000..610bc0d --- /dev/null +++ b/server/alembic/versions/2b627d0885c3_merge_heads_after_conversions.py @@ -0,0 +1,28 @@ +"""merge heads after conversions + +Revision ID: 2b627d0885c3 +Revises: 5b3c1a2f8d10, 8d1df7199cb7 +Create Date: 2025-10-06 20:27:53.974926 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '2b627d0885c3' +down_revision: Union[str, None] = ('5b3c1a2f8d10', '8d1df7199cb7') +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + pass + + +def downgrade() -> None: + """Downgrade schema.""" + pass diff --git a/server/alembic/versions/5b3c1a2f8d10_add_conversions_table.py b/server/alembic/versions/5b3c1a2f8d10_add_conversions_table.py new file mode 100644 index 0000000..7e68b56 --- /dev/null +++ b/server/alembic/versions/5b3c1a2f8d10_add_conversions_table.py @@ -0,0 +1,53 @@ +"""Add conversions table + +Revision ID: 5b3c1a2f8d10 +Revises: e6eaede720aa +Create Date: 2025-10-06 12:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '5b3c1a2f8d10' +down_revision: Union[str, None] = 'e6eaede720aa' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'conversions', + sa.Column('id', sa.Integer(), primary_key=True, autoincrement=True), + sa.Column('source_event_media_id', sa.Integer(), nullable=False), + sa.Column('target_format', sa.String(length=10), nullable=False), + sa.Column('target_path', sa.String(length=512), nullable=True), + sa.Column('status', sa.Enum('pending', 'processing', 'ready', 'failed', name='conversionstatus'), + nullable=False, server_default='pending'), + sa.Column('file_hash', sa.String(length=64), nullable=True), + sa.Column('started_at', sa.TIMESTAMP(timezone=True), nullable=True), + sa.Column('completed_at', sa.TIMESTAMP(timezone=True), nullable=True), + sa.Column('error_message', sa.Text(), nullable=True), + sa.ForeignKeyConstraint(['source_event_media_id'], ['event_media.id'], + name='fk_conversions_event_media', ondelete='CASCADE'), + ) + + op.create_index('ix_conv_source_event_media_id', 
'conversions', ['source_event_media_id']) + op.create_index('ix_conversions_target_format', 'conversions', ['target_format']) + op.create_index('ix_conv_status_target', 'conversions', ['status', 'target_format']) + op.create_index('ix_conv_source_target', 'conversions', ['source_event_media_id', 'target_format']) + + op.create_unique_constraint('uq_conv_source_target_hash', 'conversions', + ['source_event_media_id', 'target_format', 'file_hash']) + + +def downgrade() -> None: + op.drop_constraint('uq_conv_source_target_hash', 'conversions', type_='unique') + op.drop_index('ix_conv_source_target', table_name='conversions') + op.drop_index('ix_conv_status_target', table_name='conversions') + op.drop_index('ix_conversions_target_format', table_name='conversions') + op.drop_index('ix_conv_source_event_media_id', table_name='conversions') + op.drop_table('conversions') diff --git a/server/alembic/versions/b5a6c3d4e7f8_make_file_hash_not_null.py b/server/alembic/versions/b5a6c3d4e7f8_make_file_hash_not_null.py new file mode 100644 index 0000000..358b586 --- /dev/null +++ b/server/alembic/versions/b5a6c3d4e7f8_make_file_hash_not_null.py @@ -0,0 +1,40 @@ +"""Make conversions.file_hash NOT NULL + +Revision ID: b5a6c3d4e7f8 +Revises: 2b627d0885c3 +Create Date: 2025-10-06 21:05:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = "b5a6c3d4e7f8"
+down_revision: Union[str, None] = "2b627d0885c3"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Ensure no NULLs remain before altering nullability
+    op.execute("UPDATE conversions SET file_hash = '' WHERE file_hash IS NULL")
+    op.alter_column(
+        "conversions",
+        "file_hash",
+        existing_type=sa.String(length=64),
+        nullable=False,
+        existing_nullable=True,
+    )
+
+
+def downgrade() -> None:
+    op.alter_column(
+        "conversions",
+        "file_hash",
+        existing_type=sa.String(length=64),
+        nullable=True,
+        existing_nullable=False,
+    )
diff --git a/server/requirements.txt b/server/requirements.txt
index 746e01d..1cf2cff 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -6,3 +6,6 @@ python-dotenv>=1.1.0
 SQLAlchemy>=2.0.41
 flask
 gunicorn
+redis>=5.0.1
+rq>=1.16.2
+requests>=2.32.3
diff --git a/server/routes/conversions.py b/server/routes/conversions.py
new file mode 100644
index 0000000..c6dc770
--- /dev/null
+++ b/server/routes/conversions.py
@@ -0,0 +1,94 @@
+from flask import Blueprint, jsonify, request
+from server.database import Session
+from models.models import Conversion, ConversionStatus, EventMedia, MediaType
+from server.task_queue import get_queue
+from server.worker import convert_event_media_to_pdf
+from datetime import datetime, timezone
+import hashlib
+
+conversions_bp = Blueprint("conversions", __name__,
+                           url_prefix="/api/conversions")
+
+
+def sha256_file(abs_path: str) -> str:
+    h = hashlib.sha256()
+    with open(abs_path, "rb") as f:
+        for chunk in iter(lambda: f.read(8192), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+@conversions_bp.route("/<int:media_id>/pdf", methods=["POST"])
+def ensure_conversion(media_id: int):
+    session = Session()
+    try:
+        media = session.query(EventMedia).get(media_id)
+        if not media or not media.file_path:
+            return jsonify({"error": "Media not found or no file"}), 404
+
+        # Only enqueue for 
office presentation formats
+        if media.media_type not in {MediaType.ppt, MediaType.pptx, MediaType.odp}:
+            return jsonify({"message": "No conversion required for this media_type"}), 200
+
+        # Compute file hash
+        import os
+        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        media_root = os.path.join(base_dir, "media")
+        abs_source = os.path.join(media_root, media.file_path)
+        file_hash = sha256_file(abs_source)
+
+        # Find or create conversion row
+        conv = (
+            session.query(Conversion)
+            .filter_by(
+                source_event_media_id=media.id,
+                target_format="pdf",
+                file_hash=file_hash,
+            )
+            .one_or_none()
+        )
+        if not conv:
+            conv = Conversion(
+                source_event_media_id=media.id,
+                target_format="pdf",
+                status=ConversionStatus.pending,
+                file_hash=file_hash,
+            )
+            session.add(conv)
+            session.commit()
+
+        # Enqueue if not already processing/ready
+        if conv.status in {ConversionStatus.pending, ConversionStatus.failed}:
+            q = get_queue()
+            job = q.enqueue(convert_event_media_to_pdf, conv.id)
+            return jsonify({"id": conv.id, "status": conv.status.value, "job_id": job.get_id()}), 202
+        else:
+            return jsonify({"id": conv.id, "status": conv.status.value, "target_path": conv.target_path}), 200
+    finally:
+        session.close()
+
+
+@conversions_bp.route("/<int:media_id>/status", methods=["GET"])
+def conversion_status(media_id: int):
+    session = Session()
+    try:
+        conv = (
+            session.query(Conversion)
+            .filter_by(source_event_media_id=media_id, target_format="pdf")
+            .order_by(Conversion.id.desc())
+            .first()
+        )
+        if not conv:
+            return jsonify({"status": "missing"}), 404
+        return jsonify(
+            {
+                "id": conv.id,
+                "status": conv.status.value,
+                "target_path": conv.target_path,
+                "started_at": conv.started_at.isoformat() if conv.started_at else None,
+                "completed_at": conv.completed_at.isoformat() if conv.completed_at else None,
+                "error_message": conv.error_message,
+            }
+        )
+    finally:
+        session.close()
diff --git a/server/routes/eventmedia.py b/server/routes/eventmedia.py
index 
4a43d27..80a508b 100644
--- a/server/routes/eventmedia.py
+++ b/server/routes/eventmedia.py
@@ -1,7 +1,10 @@
 from re import A
 from flask import Blueprint, request, jsonify, send_from_directory
 from server.database import Session
-from models.models import EventMedia, MediaType
+from models.models import EventMedia, MediaType, Conversion, ConversionStatus
+from server.task_queue import get_queue
+from server.worker import convert_event_media_to_pdf
+import hashlib
 import os
 
 eventmedia_bp = Blueprint('eventmedia', __name__, url_prefix='/api/eventmedia')
@@ -134,6 +137,41 @@ def filemanager_upload():
             uploaded_at=datetime.now(timezone.utc)
         )
         session.add(media)
+        session.commit()
+
+        # Enqueue conversion for office presentation types
+        if media_type in {MediaType.ppt, MediaType.pptx, MediaType.odp}:
+            # compute file hash
+            h = hashlib.sha256()
+            with open(file_path, 'rb') as f:
+                for chunk in iter(lambda: f.read(8192), b""):
+                    h.update(chunk)
+            file_hash = h.hexdigest()
+
+            # upsert Conversion row
+            conv = (
+                session.query(Conversion)
+                .filter_by(
+                    source_event_media_id=media.id,
+                    target_format='pdf',
+                    file_hash=file_hash,
+                )
+                .one_or_none()
+            )
+            if not conv:
+                conv = Conversion(
+                    source_event_media_id=media.id,
+                    target_format='pdf',
+                    status=ConversionStatus.pending,
+                    file_hash=file_hash,
+                )
+                session.add(conv)
+                session.commit()
+
+            if conv.status in {ConversionStatus.pending, ConversionStatus.failed}:
+                q = get_queue()
+                q.enqueue(convert_event_media_to_pdf, conv.id)
 
         session.commit()
         return jsonify({'success': True})
diff --git a/server/routes/files.py b/server/routes/files.py
index 675adac..3012e2e 100644
--- a/server/routes/files.py
+++ b/server/routes/files.py
@@ -55,3 +55,14 @@ def download_media_file(media_id: int, filename: str):
     served_name = os.path.basename(abs_path)
     session.close()
     return send_from_directory(directory, served_name, as_attachment=True)
+
+
+@files_bp.route("/converted/<path:relpath>", methods=["GET"])
+def download_converted(relpath: str):
+    
"""Serve converted files (e.g., PDFs) relative to media/converted.""" + abs_path = os.path.join(MEDIA_ROOT, relpath) + if not abs_path.startswith(MEDIA_ROOT): + return jsonify({"error": "Invalid path"}), 400 + if not os.path.isfile(abs_path): + return jsonify({"error": "File not found"}), 404 + return send_from_directory(os.path.dirname(abs_path), os.path.basename(abs_path), as_attachment=True) diff --git a/server/rq_worker.py b/server/rq_worker.py new file mode 100644 index 0000000..6d0cabf --- /dev/null +++ b/server/rq_worker.py @@ -0,0 +1,15 @@ +import os +from rq import Worker +from server.task_queue import get_queue, get_redis_url +import redis + + +def main(): + conn = redis.from_url(get_redis_url()) + # Single queue named 'conversions' + w = Worker([get_queue().name], connection=conn) + w.work(with_scheduler=True) + + +if __name__ == "__main__": + main() diff --git a/server/task_queue.py b/server/task_queue.py new file mode 100644 index 0000000..9250395 --- /dev/null +++ b/server/task_queue.py @@ -0,0 +1,14 @@ +import os +import redis +from rq import Queue + + +def get_redis_url() -> str: + # Default to local Redis service name in compose network + return os.getenv("REDIS_URL", "redis://redis:6379/0") + + +def get_queue(name: str = "conversions") -> Queue: + conn = redis.from_url(get_redis_url()) + # 10 minutes default + return Queue(name, connection=conn, default_timeout=600) diff --git a/server/worker.py b/server/worker.py new file mode 100644 index 0000000..e47e03c --- /dev/null +++ b/server/worker.py @@ -0,0 +1,94 @@ +import os +import traceback +from datetime import datetime, timezone + +import requests +from sqlalchemy.orm import Session as SASession + +from server.database import Session +from models.models import Conversion, ConversionStatus, EventMedia, MediaType + +GOTENBERG_URL = os.getenv("GOTENBERG_URL", "http://gotenberg:3000") + + +def _now(): + return datetime.now(timezone.utc) + + +def convert_event_media_to_pdf(conversion_id: int): + """ + 
def _mark_failed(session, conversion_id: int, message: str) -> None:
    """Re-load the Conversion row and record a terminal failure on it."""
    conv = session.query(Conversion).get(conversion_id)
    if conv:
        conv.status = ConversionStatus.failed
        conv.error_message = message
        conv.completed_at = _now()
        session.commit()


def convert_event_media_to_pdf(conversion_id: int):
    """
    Job entry point: convert a single EventMedia to PDF using Gotenberg.

    Steps:
    - Load conversion + source media
    - Set status=processing, started_at
    - POST the source file to Gotenberg's /forms/libreoffice/convert route
    - Save the response bytes under media/converted/
    - Set status=ready, completed_at, target_path (relative to the media root)
    - On any error: set status=failed with an error_message
    """
    session: SASession = Session()
    try:
        conv: Conversion = session.query(Conversion).get(conversion_id)
        if not conv:
            # Row vanished (e.g. media deleted, FK cascade) -> nothing to do.
            return

        media: EventMedia = session.query(EventMedia).get(
            conv.source_event_media_id)
        if not media or not media.file_path:
            _mark_failed(session, conversion_id,
                         "Source media or file_path missing")
            return

        conv.status = ConversionStatus.processing
        conv.started_at = _now()
        session.commit()

        # Paths are resolved relative to server/media (this module lives in
        # server/), matching how EventMedia.file_path is stored.
        server_dir = os.path.dirname(os.path.abspath(__file__))
        media_root = os.path.join(server_dir, "media")
        abs_source = os.path.join(media_root, media.file_path)
        if not os.path.isfile(abs_source):
            # Fail cleanly instead of raising a bare OSError below.
            _mark_failed(session, conversion_id,
                         f"Source file not found: {media.file_path}")
            return

        converted_dir = os.path.join(media_root, "converted")
        os.makedirs(converted_dir, exist_ok=True)
        base_name = os.path.splitext(os.path.basename(media.file_path))[0]
        # Include the content hash (or conversion id as fallback) in the
        # output name: two uploads sharing a basename must not overwrite
        # each other's PDF.
        suffix = f"_{conv.file_hash[:12]}" if conv.file_hash else f"_{conv.id}"
        abs_target = os.path.join(converted_dir, f"{base_name}{suffix}.pdf")

        # Gotenberg's LibreOffice route accepts the document as multipart
        # form data under the "files" field.
        with open(abs_source, "rb") as f:
            resp = requests.post(
                f"{GOTENBERG_URL}/forms/libreoffice/convert",
                files={"files": (os.path.basename(abs_source), f)},
                timeout=600,
            )
        resp.raise_for_status()

        with open(abs_target, "wb") as out:
            out.write(resp.content)

        conv.status = ConversionStatus.ready
        # Stored relative to the media root, e.g. "converted/deck_ab12cd.pdf",
        # so the /api/files/converted/ route can serve it directly.
        conv.target_path = os.path.relpath(abs_target, media_root)
        conv.completed_at = _now()
        session.commit()
    except requests.exceptions.Timeout:
        _mark_failed(session, conversion_id, "Conversion timeout")
    except Exception as e:
        _mark_failed(session, conversion_id, f"{e}\n{traceback.format_exc()}")
    finally:
        session.close()