#!/bin/bash
# bee-selfheal — periodic best-effort recovery for critical live ISO services.

# Treat expansion of an unset variable as an error. errexit is deliberately not
# enabled here: the recovery steps further down handle their own failures.
set -u

# Tag placed in front of every message written by log().
readonly LOG_PREFIX="bee-selfheal"
# Directory that holds the JSON artifacts inspected by this script.
readonly EXPORT_DIR="/appdata/bee/export"
# Artifact whose absence triggers a restart of bee-audit.service.
readonly AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
# Artifact whose absence triggers a restart of bee-preflight.service.
readonly RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
# Directory created with mkdir and used as a lock so only one run is active.
readonly LOCK_DIR="/run/bee-selfheal.lock"

# Write one message line to stdout, tagged with "[${LOG_PREFIX}]".
# Arguments: $@ - message words; they are joined into a single string via "$*".
log() {
    printf '[%s] %s\n' "${LOG_PREFIX}" "$*"
}

# Succeed when the `lspci -nn` listing contains the text "10de:" (matched
# case-insensitively). Errors from lspci are discarded, so a missing or failing
# lspci simply yields a non-zero status.
have_nvidia_gpu() {
    local pci_listing
    pci_listing="$(lspci -nn 2>/dev/null)"
    grep -q -i -e '10de:' <<<"${pci_listing}"
}

# Report whether a systemd unit is active.
# Arguments: $1 - unit name
# Returns:   the exit status of `systemctl is-active --quiet` (0 when active);
#            systemctl's stderr is discarded.
service_active() {
    local unit="$1"
    local status=0
    systemctl is-active --quiet "${unit}" 2>/dev/null || status=$?
    return "${status}"
}

# Restart a systemd unit and log the outcome.
# Arguments: $1 - unit name
# Outputs:   one log line: "restarted <unit>" or "WARN: failed to restart <unit>"
# Returns:   0 when `systemctl restart` succeeded, 1 otherwise
restart_service() {
    local unit="$1"

    # systemctl's own output is discarded; only the outcome is logged.
    if ! systemctl restart "${unit}" >/dev/null 2>&1; then
        log "WARN: failed to restart ${unit}"
        return 1
    fi

    log "restarted ${unit}"
    return 0
}

# Succeed when the given path exists and has a size greater than zero.
# Arguments: $1 - path to test
file_ready() {
    local candidate="$1"
    [[ -s "${candidate}" ]]
}

# Classify an artifact file and print the result on stdout.
# Arguments: $1 - path of the artifact
# Outputs:   "ready"       - the path exists and is non-empty
#            "interrupted" - not ready, but "<path>.tmp" exists
#            "missing"     - neither of the above
artifact_state() {
    local target="$1"
    local verdict="missing"

    # A non-empty artifact wins even when a .tmp file is also present.
    if [[ -s "${target}" ]]; then
        verdict="ready"
    elif [[ -e "${target}.tmp" ]]; then
        verdict="interrupted"
    fi

    printf '%s\n' "${verdict}"
}

# Probe the local web server: open a TCP connection to 127.0.0.1:80, send
# "GET /healthz" as HTTP/1.0 and succeed only when the response contains a line
# that is exactly "ok".
#
# Arguments: $1 - optional time limit for the whole probe, passed to timeout(1)
#                 (default: 10)
# Returns:   0 when the "ok" line was seen; non-zero on connection failure,
#            unexpected response, or when the time limit expires.
web_healthy() {
    local -r limit="${1:-10}"
    # Runs in a child bash so file descriptor 3 never leaks into this shell.
    local -r probe='exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3'

    # Without a limit, a server that accepts the connection but never replies
    # would block grep forever and stall the whole self-heal run.
    if command -v timeout >/dev/null 2>&1; then
        timeout "${limit}" bash -c "${probe}" >/dev/null 2>&1
    else
        # timeout(1) is unavailable: fall back to the unbounded probe.
        bash -c "${probe}" >/dev/null 2>&1
    fi
}

# Make sure the export directory and /run exist. The result is not checked, so
# a failure here does not stop the run.
mkdir -p "${EXPORT_DIR}" /run

# mkdir either creates the directory or fails, which makes it usable as a
# lock: only one invocation can create LOCK_DIR.
if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
    if [ -d "${LOCK_DIR}" ]; then
        log "another self-heal run is already active"
    else
        # mkdir failed for a reason other than an existing lock (for example an
        # unwritable parent directory); say so instead of blaming another run.
        log "WARN: could not acquire lock ${LOCK_DIR}; skipping this run"
    fi
    # Either way this run is skipped without reporting a failure status.
    exit 0
fi
# Release the lock whenever the script exits. rmdir errors are ignored.
# NOTE(review): a run that dies without executing this trap (e.g. SIGKILL)
# leaves LOCK_DIR behind and blocks later runs until it is removed.
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT

log "start"

# An NVIDIA PCI device is listed but the /dev/nvidia0 node does not exist:
# restart bee-nvidia.service. A failed restart does not abort the run.
if have_nvidia_gpu; then
    if [[ ! -e /dev/nvidia0 ]]; then
        log "NVIDIA GPU detected but /dev/nvidia0 is missing"
        restart_service bee-nvidia.service || true
    fi
fi

# Log why an artifact is not ready and restart the unit named for it.
# Arguments: $1 - state as printed by artifact_state (ready|interrupted|missing)
#            $2 - artifact file name used in the log messages
#            $3 - short name of the write operation, used in the log message
#            $4 - systemd unit to restart when the artifact is not ready
# Returns:   0 always; a failed restart is logged by restart_service and ignored.
recover_artifact() {
    local state="$1" file_name="$2" writer="$3" unit="$4"

    if [ "${state}" = "ready" ]; then
        return 0
    fi

    if [ "${state}" = "interrupted" ]; then
        log "${file_name}.tmp exists — interrupted ${writer} write detected"
    else
        log "${file_name} missing or empty"
    fi
    restart_service "${unit}" || true
}

# Runtime health report: restart bee-preflight.service when it is not ready.
runtime_state="$(artifact_state "${RUNTIME_JSON}")"
recover_artifact "${runtime_state}" "runtime-health.json" "runtime-health" bee-preflight.service

# Audit report: restart bee-audit.service when it is not ready.
audit_state="$(artifact_state "${AUDIT_JSON}")"
recover_artifact "${audit_state}" "bee-audit.json" "audit" bee-audit.service

# Decide why (if at all) bee-web.service needs a restart. The HTTP probe is
# only attempted when systemd already reports the unit as active.
web_problem=""
if ! service_active bee-web.service; then
    web_problem="bee-web.service is not active"
elif ! web_healthy; then
    web_problem="bee-web health check failed"
fi

# Log the reason and restart; a failed restart does not abort the run.
if [[ -n "${web_problem}" ]]; then
    log "${web_problem}"
    restart_service bee-web.service || true
fi

log "done"
