#!/bin/bash
# bee-selfheal — periodic best-effort recovery for critical live ISO services.
#
# One run performs, in order:
#   1. take a mkdir-based lock so that only one run is active at a time;
#   2. restart bee-nvidia.service when an NVIDIA PCI device is listed but
#      /dev/nvidia0 does not exist;
#   3. restart bee-preflight.service / bee-audit.service when their exported
#      JSON artifact is missing or empty;
#   4. restart bee-web.service when it is inactive or its /healthz probe fails.
#
# Every restart is best-effort: failures are logged and the run carries on.
#
# NOTE: only `-u` is enabled on purpose. `-e` would abort a run at the first
# failed step, and `pipefail` could make `lspci | grep -q` report failure when
# grep closes the pipe early.
set -u

readonly LOG_PREFIX="bee-selfheal"
readonly EXPORT_DIR="/appdata/bee/export"
readonly AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
readonly RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
readonly LOCK_DIR="/run/bee-selfheal.lock"
# Upper bound, in whole seconds, for reading the /healthz reply.
readonly WEB_HEALTH_TIMEOUT_S=5

# Print a message to stdout, prefixed with "[bee-selfheal] ".
# Arguments: $* - message text
log() {
    printf '[%s] %s\n' "${LOG_PREFIX}" "$*"
}

# Succeed when lspci lists at least one device whose numeric ID contains
# "10de:" (the NVIDIA PCI vendor ID). Fails when lspci is unavailable.
have_nvidia_gpu() {
    lspci -nn 2>/dev/null | grep -qi '10de:'
}

# Succeed when systemd reports the unit as active.
# Arguments: $1 - unit name
service_active() {
    systemctl is-active --quiet "$1" 2>/dev/null
}

# Restart a systemd unit and log the outcome.
# Arguments: $1 - unit name
# Returns:   0 when the restart succeeded, 1 otherwise
restart_service() {
    local svc="$1"

    if ! systemctl restart "${svc}" >/dev/null 2>&1; then
        log "WARN: failed to restart ${svc}"
        return 1
    fi
    log "restarted ${svc}"
    return 0
}

# Succeed when the path exists and is non-empty.
# Arguments: $1 - file path
file_ready() {
    [[ -s "$1" ]]
}

# Classify an exported artifact and print the state on stdout.
# Arguments: $1 - artifact path
# Outputs:   "ready"       - the file exists and is non-empty
#            "interrupted" - not ready, but "<path>.tmp" exists
#            "missing"     - neither of the above
artifact_state() {
    local path="$1"

    if file_ready "${path}"; then
        printf 'ready\n'
    elif [[ -e "${path}.tmp" ]]; then
        printf 'interrupted\n'
    else
        printf 'missing\n'
    fi
}

# Scan an HTTP reply on stdin for a line that is exactly "ok".
#
# A single trailing CR is ignored so a CRLF-terminated body line matches too,
# and a final line without a newline is still examined. Reading stops at EOF
# or once the deadline has passed, so a peer that keeps the connection open
# without sending anything cannot block the caller indefinitely.
#
# Arguments: $1 - overall deadline in whole seconds (default 5)
# Returns:   0 when an "ok" line was seen, 1 on EOF or timeout without one
http_reply_has_ok() {
    local limit="${1:-5}"
    local deadline=$(( SECONDS + limit ))
    local line rc

    while (( SECONDS < deadline )); do
        line=""
        # The remaining time is always >= 1 here; `read -t 0` would only poll.
        IFS= read -r -t "$(( deadline - SECONDS ))" line
        rc=$?
        if [[ "${line%$'\r'}" == "ok" ]]; then
            return 0
        fi
        # Non-zero means EOF or timeout: nothing more will be read.
        (( rc == 0 )) || return 1
    done
    return 1
}

# Probe http://127.0.0.1:80/healthz over bash's /dev/tcp and succeed when the
# reply contains an "ok" line.
#
# The probe runs in a subshell so file descriptor 3 is closed automatically.
# The read is bounded: without a deadline, a server that accepts the
# connection but never answers would block this run forever while it holds
# the lock, and every later run would exit as "already active".
#
# Arguments: $1 - reply deadline in seconds (default WEB_HEALTH_TIMEOUT_S)
web_healthy() {
    local limit="${1:-${WEB_HEALTH_TIMEOUT_S}}"

    (
        exec 3<>/dev/tcp/127.0.0.1/80 || exit 1
        printf 'GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n' >&3 || exit 1
        http_reply_has_ok "${limit}" <&3
    ) >/dev/null 2>&1
}

# Restart the producing service when its artifact is not ready.
# Arguments: $1 - artifact path
#            $2 - short label used in the "interrupted" log message
#            $3 - unit to restart (presumably the one that writes the
#                 artifact — confirm against the unit definitions)
# Returns:   always 0; a failed restart is only logged
ensure_artifact() {
    local path="$1"
    local label="$2"
    local svc="$3"
    local name="${path##*/}"
    local state

    state="$(artifact_state "${path}")"
    case "${state}" in
        ready)
            return 0
            ;;
        interrupted)
            log "${name}.tmp exists — interrupted ${label} write detected"
            ;;
        *)
            log "${name} missing or empty"
            ;;
    esac
    restart_service "${svc}" || true
}

# Entry point: take the lock, then run every recovery check once.
main() {
    mkdir -p "${EXPORT_DIR}" /run

    # mkdir is atomic, so the directory doubles as a lock.
    if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
        log "another self-heal run is already active"
        exit 0
    fi
    # Installed only after the lock is ours, so a run that lost the race
    # never removes the winner's lock.
    trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT

    log "start"

    if have_nvidia_gpu && [[ ! -e /dev/nvidia0 ]]; then
        log "NVIDIA GPU detected but /dev/nvidia0 is missing"
        restart_service bee-nvidia.service || true
    fi

    ensure_artifact "${RUNTIME_JSON}" "runtime-health" bee-preflight.service
    ensure_artifact "${AUDIT_JSON}" "audit" bee-audit.service

    if ! service_active bee-web.service; then
        log "bee-web.service is not active"
        restart_service bee-web.service || true
    elif ! web_healthy; then
        log "bee-web health check failed"
        restart_service bee-web.service || true
    fi

    log "done"
}

main