|
|
|
|
@@ -8,11 +8,17 @@ EXPORT_DIR="/appdata/bee/export"
|
|
|
|
|
# Audit artifact; its state decides whether bee-audit.service is restarted.
AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
# Runtime-health artifact; its state decides whether bee-preflight.service is restarted.
RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
# Lock directory: it is created with plain mkdir, which fails if it already exists.
LOCK_DIR="/run/bee-selfheal.lock"
# Number of events recorded through log_event during this run.
EVENTS=0
|
|
|
|
|
|
|
|
|
|
#######################################
# Print one log line to stdout, tagged with the script's prefix.
# Globals:   LOG_PREFIX (read)
# Arguments: message words; they are joined into a single string via "$*"
# Outputs:   "[<LOG_PREFIX>] <message>" followed by a newline
#######################################
log() {
  # printf with %s prints the message verbatim; echo's treatment of
  # backslashes varies between shells.
  printf '[%s] %s\n' "${LOG_PREFIX}" "$*"
}
|
|
|
|
|
|
|
|
|
|
#######################################
# Record an event: bump the event counter, then log the message.
# Globals:   EVENTS (read and written)
# Arguments: message words; forwarded to log as one string via "$*"
# Outputs:   whatever log prints for the message
#######################################
log_event() {
  # Must run in the current shell (not inside $(...)) for the counter
  # update to be visible to the rest of the script.
  EVENTS=$((EVENTS + 1))
  log "$*"
}
|
|
|
|
|
|
|
|
|
|
#######################################
# Check whether lspci lists an NVIDIA display device.
# A line matches when its class field is 0300 or 0302 and its
# vendor:device field starts with 10de (NVIDIA's PCI vendor ID).
# Arguments: none
# Returns:   0 if at least one matching device is listed, 1 otherwise
#            (including when lspci produces no output)
#######################################
have_nvidia_gpu() {
  # -D always prints the PCI domain, -n prints numeric IDs, so the fields
  # are: <address> <class>: <vendor>:<device> ...
  # lspci errors are discarded on purpose; empty input simply yields 1.
  lspci -Dn 2>/dev/null \
    | awk '
        $2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found = 1; exit }
        END { exit(found ? 0 : 1) }
      '
}
|
|
|
|
|
@@ -56,24 +62,22 @@ web_healthy() {
|
|
|
|
|
# Make sure the export directory and /run exist before taking the lock.
mkdir -p "${EXPORT_DIR}" /run

# Plain mkdir is the lock primitive: it fails when the directory already
# exists. Any mkdir failure is reported as "another run is active".
if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
  # log_event already prints the message via log, so it is called once only.
  log_event "another self-heal run is already active"
  exit 0
fi
# Release the lock on exit; rmdir failures are ignored on purpose.
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT

log "start"
|
|
|
|
|
|
|
|
|
|
# An NVIDIA GPU is on the PCI bus but its device node is absent:
# record the event and restart the NVIDIA service.
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
  # log_event already prints the message via log, so it is called once only.
  log_event "NVIDIA GPU detected but /dev/nvidia0 is missing"
  # Best effort: the restart's exit status is deliberately discarded.
  restart_service bee-nvidia.service || true
fi
|
|
|
|
|
|
|
|
|
|
# Inspect the runtime-health artifact; anything other than "ready"
# is recorded as an event and triggers a preflight restart.
runtime_state="$(artifact_state "${RUNTIME_JSON}")"
if [ "${runtime_state}" != "ready" ]; then
  # log_event already prints the message via log, so each message is
  # emitted through log_event alone.
  if [ "${runtime_state}" = "interrupted" ]; then
    log_event "runtime-health.json.tmp exists — interrupted runtime-health write detected"
  else
    # Every other non-ready state is reported as missing or empty.
    log_event "runtime-health.json missing or empty"
  fi
  # Best effort: the restart's exit status is deliberately discarded.
  restart_service bee-preflight.service || true
fi
|
|
|
|
|
@@ -81,19 +85,17 @@ fi
|
|
|
|
|
# Inspect the audit artifact; anything other than "ready" is recorded
# as an event and triggers an audit restart.
audit_state="$(artifact_state "${AUDIT_JSON}")"
if [ "${audit_state}" != "ready" ]; then
  # log_event already prints the message via log, so each message is
  # emitted through log_event alone.
  if [ "${audit_state}" = "interrupted" ]; then
    log_event "bee-audit.json.tmp exists — interrupted audit write detected"
  else
    # Every other non-ready state is reported as missing or empty.
    log_event "bee-audit.json missing or empty"
  fi
  # Best effort: the restart's exit status is deliberately discarded.
  restart_service bee-audit.service || true
fi
|
|
|
|
|
|
|
|
|
|
# Restart bee-web when the unit is not active, or when it is active but
# fails its health check. The health check is only run for an active unit.
if ! service_active bee-web.service; then
  # log_event already prints the message via log, so it is called once only.
  log_event "bee-web.service is not active"
  # Best effort: the restart's exit status is deliberately discarded.
  restart_service bee-web.service || true
elif ! web_healthy; then
  log_event "bee-web health check failed"
  restart_service bee-web.service || true
fi

log "done"
|
|
|
|
|
|