Improve MQTT resilience, clarify behavior, and minor UX
Switch to Paho v2 callbacks; add loop_start() and reconnect_delay_set() for auto-reconnect. Rework on_connect/on_disconnect to v2 signatures; handle session_present and reconnection flows. Gate heartbeats with client.is_connected() and add a short retry on rc=4 (NO_CONN). Re-send discovery after reconnect; ensure re-subscription to all topics. Add a startup terminal message with ISO timestamp in simclient.py. Docs: update README and Copilot instructions with reconnection/heartbeat guidance and benign rc=4 notes.
This commit is contained in:
@@ -2,6 +2,9 @@
|
|||||||
# Copy this file to .env and fill in values appropriate for your environment.
|
# Copy this file to .env and fill in values appropriate for your environment.
|
||||||
|
|
||||||
# Environment
|
# Environment
|
||||||
|
# IMPORTANT: CEC TV control is automatically DISABLED when ENV=development
|
||||||
|
# to avoid constantly switching the TV on/off during testing.
|
||||||
|
# Set to 'production' to enable automatic CEC TV control.
|
||||||
ENV=development # development | production
|
ENV=development # development | production
|
||||||
DEBUG_MODE=0 # 1 to enable debug mode
|
DEBUG_MODE=0 # 1 to enable debug mode
|
||||||
LOG_LEVEL=INFO # DEBUG | INFO | WARNING | ERROR
|
LOG_LEVEL=INFO # DEBUG | INFO | WARNING | ERROR
|
||||||
@@ -12,17 +15,27 @@ MQTT_BROKER=<your-mqtt-broker-host-or-ip>
|
|||||||
MQTT_PORT=1883
|
MQTT_PORT=1883
|
||||||
|
|
||||||
# Timing Configuration (seconds)
|
# Timing Configuration (seconds)
|
||||||
HEARTBEAT_INTERVAL=60
|
HEARTBEAT_INTERVAL=60 # Heartbeat frequency in seconds
|
||||||
SCREENSHOT_INTERVAL=180
|
SCREENSHOT_INTERVAL=180 # Screenshot capture frequency in seconds
|
||||||
DISPLAY_CHECK_INTERVAL=15
|
DISPLAY_CHECK_INTERVAL=15 # Display Manager event check frequency in seconds
|
||||||
|
|
||||||
# File/API Server (used to download presentation files)
|
# File/API Server (used to download presentation files)
|
||||||
# If event URLs use host 'server', the client rewrites them to this server.
|
# By default, the client rewrites incoming file URLs that point to 'server' to this host.
|
||||||
# You can either set FILE_SERVER_BASE_URL (preferred) or FILE_SERVER_HOST+PORT+SCHEME.
|
# If not set, FILE_SERVER_HOST defaults to the same host as MQTT_BROKER.
|
||||||
FILE_SERVER_HOST=
|
# You can also set FILE_SERVER_BASE_URL (e.g., http://192.168.1.10:8000) to override entirely.
|
||||||
FILE_SERVER_PORT=8000
|
FILE_SERVER_HOST= # optional: e.g., 192.168.43.100
|
||||||
FILE_SERVER_SCHEME=http
|
FILE_SERVER_PORT=8000 # default port for API server
|
||||||
# FILE_SERVER_BASE_URL=http://<your-file-server-host>:8000 # optional, takes precedence
|
FILE_SERVER_SCHEME=http # http or https
|
||||||
|
# FILE_SERVER_BASE_URL= # optional: takes precedence over the above when set
|
||||||
|
|
||||||
|
# HDMI-CEC TV Control (optional)
|
||||||
|
# Automatically turn TV on/off based on event scheduling
|
||||||
|
# NOTE: CEC is automatically DISABLED when ENV=development to avoid constantly switching TV on/off during testing
|
||||||
|
CEC_ENABLED=true # Enable automatic TV power control in production (true/false)
|
||||||
|
CEC_DEVICE=0 # Target CEC device (TV, 0, etc.)
|
||||||
|
CEC_TURN_OFF_DELAY=30 # Seconds to wait before turning off TV after last event ends
|
||||||
|
CEC_POWER_ON_WAIT=5 # Seconds to wait after power ON command (for TV to boot up)
|
||||||
|
CEC_POWER_OFF_WAIT=5 # Seconds to wait after power OFF command (increased for slower TVs)
|
||||||
|
|
||||||
# Optional: MQTT authentication (if your broker requires username/password)
|
# Optional: MQTT authentication (if your broker requires username/password)
|
||||||
#MQTT_USERNAME=
|
#MQTT_USERNAME=
|
||||||
|
|||||||
5
.github/copilot-instructions.md
vendored
5
.github/copilot-instructions.md
vendored
@@ -33,6 +33,11 @@ The server now performs all PPTX → PDF conversion. The client only ever downlo
|
|||||||
### MQTT Communication Patterns
|
### MQTT Communication Patterns
|
||||||
- **Discovery**: `infoscreen/discovery` → `infoscreen/{client_id}/discovery_ack`
|
- **Discovery**: `infoscreen/discovery` → `infoscreen/{client_id}/discovery_ack`
|
||||||
- **Heartbeat**: Regular `infoscreen/{client_id}/heartbeat` messages
|
- **Heartbeat**: Regular `infoscreen/{client_id}/heartbeat` messages
|
||||||
|
### MQTT Reconnection & Heartbeat (Nov 2025)
|
||||||
|
- The client uses the Paho MQTT v2 callback API with `client.loop_start()` and `client.reconnect_delay_set()` to handle automatic reconnection.
|
||||||
|
- `on_connect` re-subscribes to all topics (`discovery_ack`, `config`, `group_id`, current group events) and re-sends discovery on reconnect to re-register with the server.
|
||||||
|
- Heartbeats are gated by `client.is_connected()` and retry once on `NO_CONN` (rc=4). Occasional rc=4 warnings are normal right after broker restarts or brief network stalls and are typically followed by a successful heartbeat.
|
||||||
|
- Do not treat a single rc=4 heartbeat warning as a failure. Investigate only if multiple consecutive heartbeats fail without recovery.
|
||||||
- **Dashboard**: Screenshot transmission via `infoscreen/{client_id}/dashboard`
|
- **Dashboard**: Screenshot transmission via `infoscreen/{client_id}/dashboard`
|
||||||
- **Group Assignment**: Server sends group via `infoscreen/{client_id}/group_id`
|
- **Group Assignment**: Server sends group via `infoscreen/{client_id}/group_id`
|
||||||
- **Events**: Content commands via `infoscreen/events/{group_id}`
|
- **Events**: Content commands via `infoscreen/events/{group_id}`
|
||||||
|
|||||||
@@ -469,6 +469,13 @@ sudo systemctl status mosquitto
|
|||||||
**Try fallback brokers:**
|
**Try fallback brokers:**
|
||||||
Edit `.env` and add `MQTT_BROKER_FALLBACKS`
|
Edit `.env` and add `MQTT_BROKER_FALLBACKS`
|
||||||
|
|
||||||
|
#### Client auto-reconnect and heartbeat behavior (Nov 2025)
|
||||||
|
- The MQTT client now uses Paho v2 callbacks and `loop_start()` for automatic reconnection with exponential backoff.
|
||||||
|
- All topic subscriptions are restored in `on_connect` and a discovery message is re-sent on reconnect to re-register the client.
|
||||||
|
- Heartbeats are sent only when connected; if a publish occurs during a brief reconnect window, Paho may return rc=4 (NO_CONN). The client then waits 2 seconds, retries once, and logs the outcome.
|
||||||
|
- An occasional `Heartbeat publish failed after retry with code: 4` or `Skipping heartbeat retry - MQTT still not connected` warning after a broker restart or a transient network hiccup is expected and harmless. It indicates "not connected at this instant"; the next heartbeat typically succeeds.
|
||||||
|
- When to investigate: repeated rc=4 with no succeeding "Heartbeat sent" entries over multiple intervals.
|
||||||
|
|
||||||
### Screenshots not uploading
|
### Screenshots not uploading
|
||||||
|
|
||||||
**Test screenshot capture:**
|
**Test screenshot capture:**
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ log_ok "Development tools installed"
|
|||||||
|
|
||||||
# 3. Display / presentation dependencies
|
# 3. Display / presentation dependencies
|
||||||
log_step "Installing presentation & display tools (incl. HDMI-CEC)..."
|
log_step "Installing presentation & display tools (incl. HDMI-CEC)..."
|
||||||
sudo apt install -y chromium-browser libreoffice vlc feh scrot imagemagick xdotool wmctrl cec-utils
|
sudo apt install -y chromium-browser libreoffice vlc feh scrot imagemagick xdotool wmctrl cec-utils impressive
|
||||||
log_ok "Presentation + HDMI-CEC tools installed (cec-utils)"
|
log_ok "Presentation + HDMI-CEC tools installed (cec-utils)"
|
||||||
|
|
||||||
# 4. MQTT tools
|
# 4. MQTT tools
|
||||||
|
|||||||
154
src/simclient.py
154
src/simclient.py
@@ -552,6 +552,7 @@ def screenshot_service_thread(client, client_id):
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
global discovered
|
global discovered
|
||||||
|
print(f"[{datetime.now().isoformat()}] simclient.py: program started")
|
||||||
logging.info("Client starting - deleting old event file if present")
|
logging.info("Client starting - deleting old event file if present")
|
||||||
delete_event_file()
|
delete_event_file()
|
||||||
|
|
||||||
@@ -575,6 +576,18 @@ def main():
|
|||||||
client = mqtt.Client(**client_kwargs)
|
client = mqtt.Client(**client_kwargs)
|
||||||
client.on_message = on_message
|
client.on_message = on_message
|
||||||
|
|
||||||
|
# Enable automatic reconnection
|
||||||
|
client.reconnect_delay_set(min_delay=1, max_delay=120)
|
||||||
|
|
||||||
|
# Connection state tracking
|
||||||
|
connection_state = {"connected": False, "last_disconnect": None}
|
||||||
|
|
||||||
|
# Optional: Enable MQTT debug logging in DEBUG_MODE
|
||||||
|
if DEBUG_MODE:
|
||||||
|
def on_log(client, userdata, level, buf):
|
||||||
|
logging.debug(f"MQTT: {buf}")
|
||||||
|
client.on_log = on_log
|
||||||
|
|
||||||
# Define subscribe_event_topic BEFORE on_connect so it can be called from the callback
|
# Define subscribe_event_topic BEFORE on_connect so it can be called from the callback
|
||||||
def subscribe_event_topic(new_group_id):
|
def subscribe_event_topic(new_group_id):
|
||||||
nonlocal event_topic, current_group_id
|
nonlocal event_topic, current_group_id
|
||||||
@@ -613,7 +626,26 @@ def main():
|
|||||||
# on_connect callback: Subscribe to all topics after connection is established
|
# on_connect callback: Subscribe to all topics after connection is established
|
||||||
def on_connect(client, userdata, flags, rc, properties=None):
|
def on_connect(client, userdata, flags, rc, properties=None):
|
||||||
if rc == 0:
|
if rc == 0:
|
||||||
logging.info("MQTT connected successfully - subscribing to topics...")
|
connection_state["connected"] = True
|
||||||
|
connection_state["last_disconnect"] = None
|
||||||
|
|
||||||
|
# Check if this is a reconnection
|
||||||
|
# paho-mqtt v2 provides ConnectFlags with attribute 'session_present'
|
||||||
|
# Older versions may provide dict-like flags; default to False.
|
||||||
|
is_reconnect = False
|
||||||
|
try:
|
||||||
|
if hasattr(flags, "session_present"):
|
||||||
|
is_reconnect = bool(getattr(flags, "session_present"))
|
||||||
|
elif isinstance(flags, dict):
|
||||||
|
is_reconnect = bool(flags.get("session present", False))
|
||||||
|
except Exception:
|
||||||
|
is_reconnect = False
|
||||||
|
|
||||||
|
if is_reconnect:
|
||||||
|
logging.info("MQTT reconnected successfully - resubscribing to all topics...")
|
||||||
|
else:
|
||||||
|
logging.info("MQTT connected successfully - subscribing to topics...")
|
||||||
|
|
||||||
# Discovery-ACK-Topic abonnieren
|
# Discovery-ACK-Topic abonnieren
|
||||||
ack_topic = f"infoscreen/{client_id}/discovery_ack"
|
ack_topic = f"infoscreen/{client_id}/discovery_ack"
|
||||||
client.subscribe(ack_topic)
|
client.subscribe(ack_topic)
|
||||||
@@ -632,10 +664,28 @@ def main():
|
|||||||
if current_group_id:
|
if current_group_id:
|
||||||
logging.info(f"Subscribing to event topic for saved group_id: {current_group_id}")
|
logging.info(f"Subscribing to event topic for saved group_id: {current_group_id}")
|
||||||
subscribe_event_topic(current_group_id)
|
subscribe_event_topic(current_group_id)
|
||||||
|
|
||||||
|
# Send discovery message after reconnection to re-register with server
|
||||||
|
if is_reconnect:
|
||||||
|
logging.info("Sending discovery after reconnection to re-register with server")
|
||||||
|
send_discovery(client, client_id, hardware_token, ip_addr)
|
||||||
else:
|
else:
|
||||||
|
connection_state["connected"] = False
|
||||||
logging.error(f"MQTT connection failed with code: {rc}")
|
logging.error(f"MQTT connection failed with code: {rc}")
|
||||||
|
|
||||||
|
# on_disconnect callback (Paho v2 signature)
|
||||||
|
def on_disconnect(client, userdata, disconnect_flags, rc, properties=None):
|
||||||
|
connection_state["connected"] = False
|
||||||
|
connection_state["last_disconnect"] = time.time()
|
||||||
|
|
||||||
|
if rc == 0:
|
||||||
|
logging.info("MQTT disconnected cleanly")
|
||||||
|
else:
|
||||||
|
logging.warning(f"MQTT disconnected unexpectedly with code: {rc}")
|
||||||
|
logging.info("Automatic reconnection will be attempted...")
|
||||||
|
|
||||||
client.on_connect = on_connect
|
client.on_connect = on_connect
|
||||||
|
client.on_disconnect = on_disconnect
|
||||||
|
|
||||||
# Robust MQTT connect with fallbacks and retries
|
# Robust MQTT connect with fallbacks and retries
|
||||||
broker_candidates = [MQTT_BROKER]
|
broker_candidates = [MQTT_BROKER]
|
||||||
@@ -669,13 +719,22 @@ def main():
|
|||||||
logging.error(f"MQTT connection failed after multiple attempts: {last_error}")
|
logging.error(f"MQTT connection failed after multiple attempts: {last_error}")
|
||||||
raise last_error
|
raise last_error
|
||||||
|
|
||||||
|
# Start the network loop early to begin connection process
|
||||||
|
client.loop_start()
|
||||||
|
logging.info("MQTT network loop started - establishing connection...")
|
||||||
|
|
||||||
# Wait for connection to complete and on_connect callback to fire
|
# Wait for connection to complete and on_connect callback to fire
|
||||||
# This ensures subscriptions are set up before we start discovery
|
logging.info("Waiting for initial connection and subscription setup...")
|
||||||
logging.info("Waiting for on_connect callback and subscription setup...")
|
connection_timeout = 30 # seconds
|
||||||
for _ in range(10): # Wait up to ~1 second
|
start_wait = time.time()
|
||||||
client.loop(timeout=0.1)
|
while not connection_state["connected"] and (time.time() - start_wait) < connection_timeout:
|
||||||
time.sleep(0.1)
|
time.sleep(0.5)
|
||||||
logging.info("Subscription setup complete, starting discovery phase")
|
|
||||||
|
if not connection_state["connected"]:
|
||||||
|
logging.error(f"Failed to establish initial MQTT connection within {connection_timeout}s")
|
||||||
|
raise Exception("MQTT connection timeout")
|
||||||
|
|
||||||
|
logging.info("Initial connection established, subscription setup complete")
|
||||||
|
|
||||||
# group_id message callback
|
# group_id message callback
|
||||||
group_id_topic = f"infoscreen/{client_id}/group_id"
|
group_id_topic = f"infoscreen/{client_id}/group_id"
|
||||||
@@ -705,16 +764,27 @@ def main():
|
|||||||
logging.info(f"Current group_id at start: {current_group_id if current_group_id else 'none'}")
|
logging.info(f"Current group_id at start: {current_group_id if current_group_id else 'none'}")
|
||||||
|
|
||||||
# Discovery-Phase: Sende Discovery bis ACK empfangen
|
# Discovery-Phase: Sende Discovery bis ACK empfangen
|
||||||
while not discovered:
|
# The loop is already started, just wait and send discovery messages
|
||||||
send_discovery(client, client_id, hardware_token, ip_addr)
|
discovery_attempts = 0
|
||||||
# Check for messages and discovered flag more frequently
|
max_discovery_attempts = 20
|
||||||
for _ in range(int(HEARTBEAT_INTERVAL)):
|
while not discovered and discovery_attempts < max_discovery_attempts:
|
||||||
client.loop(timeout=1.0)
|
if connection_state["connected"]:
|
||||||
if discovered:
|
send_discovery(client, client_id, hardware_token, ip_addr)
|
||||||
break
|
discovery_attempts += 1
|
||||||
time.sleep(1)
|
# Wait for ACK, checking every second
|
||||||
|
for _ in range(int(HEARTBEAT_INTERVAL)):
|
||||||
|
if discovered:
|
||||||
|
break
|
||||||
|
time.sleep(1)
|
||||||
|
else:
|
||||||
|
logging.info("Waiting for MQTT connection before sending discovery...")
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
if discovered:
|
if discovered:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if not discovered:
|
||||||
|
logging.warning(f"Discovery ACK not received after {max_discovery_attempts} attempts - continuing anyway")
|
||||||
|
|
||||||
# Start screenshot service in background thread
|
# Start screenshot service in background thread
|
||||||
screenshot_thread = threading.Thread(
|
screenshot_thread = threading.Thread(
|
||||||
@@ -725,16 +795,54 @@ def main():
|
|||||||
screenshot_thread.start()
|
screenshot_thread.start()
|
||||||
logging.info("Screenshot service thread started")
|
logging.info("Screenshot service thread started")
|
||||||
|
|
||||||
# Heartbeat-Loop
|
# Heartbeat-Loop with connection state monitoring
|
||||||
last_heartbeat = 0
|
last_heartbeat = 0
|
||||||
|
logging.info("Entering heartbeat loop (network loop already running in background thread)")
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
current_time = time.time()
|
try:
|
||||||
if current_time - last_heartbeat >= HEARTBEAT_INTERVAL:
|
current_time = time.time()
|
||||||
client.publish(f"infoscreen/{client_id}/heartbeat", "alive")
|
|
||||||
logging.info("Heartbeat sent.")
|
# Check connection state and log warnings if disconnected
|
||||||
last_heartbeat = current_time
|
if not connection_state["connected"]:
|
||||||
client.loop(timeout=5.0)
|
if connection_state["last_disconnect"]:
|
||||||
time.sleep(5)
|
disconnect_duration = current_time - connection_state["last_disconnect"]
|
||||||
|
logging.warning(f"MQTT disconnected for {disconnect_duration:.1f}s - waiting for reconnection...")
|
||||||
|
else:
|
||||||
|
logging.warning("MQTT not connected - waiting for connection...")
|
||||||
|
|
||||||
|
# Send heartbeat only when connected
|
||||||
|
if current_time - last_heartbeat >= HEARTBEAT_INTERVAL:
|
||||||
|
if client.is_connected():
|
||||||
|
result = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
|
||||||
|
if result.rc == mqtt.MQTT_ERR_SUCCESS:
|
||||||
|
logging.info("Heartbeat sent.")
|
||||||
|
elif result.rc == mqtt.MQTT_ERR_NO_CONN:
|
||||||
|
logging.debug("Heartbeat publish returned NO_CONN; retrying in 2s...")
|
||||||
|
time.sleep(2)
|
||||||
|
if client.is_connected():
|
||||||
|
retry = client.publish(f"infoscreen/{client_id}/heartbeat", "alive", qos=0)
|
||||||
|
if retry.rc == mqtt.MQTT_ERR_SUCCESS:
|
||||||
|
logging.info("Heartbeat sent after retry.")
|
||||||
|
else:
|
||||||
|
logging.warning(f"Heartbeat publish failed after retry with code: {retry.rc}")
|
||||||
|
else:
|
||||||
|
logging.warning("Skipping heartbeat retry - MQTT still not connected")
|
||||||
|
else:
|
||||||
|
logging.warning(f"Heartbeat publish failed with code: {result.rc}")
|
||||||
|
else:
|
||||||
|
logging.debug("Skipping heartbeat - MQTT not connected (is_connected=False)")
|
||||||
|
last_heartbeat = current_time
|
||||||
|
|
||||||
|
time.sleep(5)
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
logging.info("Shutting down gracefully...")
|
||||||
|
client.loop_stop()
|
||||||
|
client.disconnect()
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Error in main loop: {e}")
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user