Add stability hardening and self-heal recovery
This commit is contained in:
99
iso/overlay/usr/local/bin/bee-selfheal
Normal file
99
iso/overlay/usr/local/bin/bee-selfheal
Normal file
@@ -0,0 +1,99 @@
|
||||
#!/bin/bash
# bee-selfheal — periodic best-effort recovery for critical live ISO services.
#
# Looks at the NVIDIA device node, the exported runtime-health and audit JSON
# files, and the web service, and restarts the matching systemd unit when a
# check fails.

# Only -u is enabled: referencing an unset variable is treated as a bug.
# -e and pipefail are left off; the checks in this script rely on inspecting
# exit statuses themselves (and on `grep -q` closing a pipe early).
set -u

# Tag placed in front of every log line.
readonly LOG_PREFIX="bee-selfheal"
# Directory holding the exported JSON artifacts.
readonly EXPORT_DIR="/appdata/bee/export"
# Audit export whose presence is checked.
readonly AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
# Runtime-health export whose presence is checked.
readonly RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
# Directory used as a lock so that only one run is active at a time.
readonly LOCK_DIR="/run/bee-selfheal.lock"
|
||||
|
||||
# Write one log line to stdout in the form "[<LOG_PREFIX>] <message>".
# Globals:   LOG_PREFIX (read)
# Arguments: $* - message words; joined by the first character of IFS
#                 (a space by default)
# printf is used instead of echo so the message is emitted verbatim no matter
# how the shell's echo treats backslashes.
log() {
  printf '[%s] %s\n' "${LOG_PREFIX}" "$*"
}
|
||||
|
||||
# Report whether the PCI bus lists an NVIDIA display-class device.
# Returns: 0 when `lspci -nn` prints a line whose class code is 03xx and whose
#          vendor ID is 10de; non-zero otherwise (including when lspci is
#          missing or prints nothing).
#
# A bare "10de:" substring match would also be satisfied by NVIDIA functions
# that are not GPUs (PCI bridges, audio or USB functions), so the pattern is
# anchored to the bracketed class code and the bracketed vendor:device pair
# that `lspci -nn` prints.
have_nvidia_gpu() {
  lspci -nn 2>/dev/null \
    | grep -qiE '\[03[0-9a-f]{2}\]:.*\[10de:[0-9a-f]{4}\]'
}
|
||||
|
||||
# Ask systemd whether a unit is currently active.
# Arguments: $1 - unit name, e.g. bee-web.service
# Returns:   the exit status of `systemctl is-active --quiet` (0 when active).
#            systemctl's stderr is discarded.
service_active() {
  local unit="$1"
  systemctl is-active --quiet "${unit}" 2>/dev/null
}
|
||||
|
||||
# Restart a systemd unit and log the outcome.
# Arguments: $1 - unit name
# Outputs:   one log line: "restarted <unit>" on success,
#            "WARN: failed to restart <unit>" on failure.
#            systemctl's own output is discarded.
# Returns:   0 when `systemctl restart` succeeded, 1 otherwise.
restart_service() {
  local unit="$1"

  if ! systemctl restart "${unit}" >/dev/null 2>&1; then
    log "WARN: failed to restart ${unit}"
    return 1
  fi

  log "restarted ${unit}"
  return 0
}
|
||||
|
||||
# Test whether a file exists and is non-empty.
# Arguments: $1 - path to test
# Returns:   0 when the path exists and has a size greater than zero,
#            non-zero otherwise.
file_ready() {
  [[ -s "$1" ]]
}
|
||||
|
||||
# Classify an exported artifact by what is present on disk.
# Arguments: $1 - path of the artifact
# Outputs:   exactly one word on stdout:
#              ready       - the artifact exists and is non-empty
#              interrupted - the artifact is absent or empty, but "<path>.tmp"
#                            exists
#              missing     - neither of the above
# Returns:   0 in every case; callers read the word, not the status.
artifact_state() {
  local path="$1"
  local state="missing"

  # A usable artifact wins even if a .tmp file is lying next to it.
  if [[ -s "${path}" ]]; then
    state="ready"
  elif [[ -e "${path}.tmp" ]]; then
    state="interrupted"
  fi

  printf '%s\n' "${state}"
}
|
||||
|
||||
# Probe the local web service's /healthz endpoint.
# Opens 127.0.0.1:80 through bash's /dev/tcp, sends an HTTP/1.0 GET for
# /healthz and scans the reply for a line that is exactly "ok".
# Outputs:   nothing; all output of the probe is discarded.
# Returns:   0 when such a line is found; non-zero when the connection fails,
#            the line is absent, or the probe exceeds the time limit.
web_healthy() {
  # Upper bound for the whole probe, in seconds.
  local -r limit=5
  # The probe runs in a child bash so descriptor 3 and any connection error
  # stay out of the calling shell.
  local -r probe='exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3'

  # Without a limit, a server that accepts the connection but never replies
  # would leave grep reading forever and stall the whole run.
  if command -v timeout >/dev/null 2>&1; then
    timeout "${limit}" bash -c "${probe}" >/dev/null 2>&1
  else
    # No timeout tool available: fall back to the unbounded probe.
    bash -c "${probe}" >/dev/null 2>&1
  fi
}
|
||||
|
||||
# Restart SERVICE unless the exported artifact at PATH is ready.
# Arguments: $1 - artifact path
#            $2 - short label used in the "interrupted ... write" log line
#            $3 - systemd unit restarted when the artifact is not ready
# Returns:   0 always; a failed restart is logged by restart_service and then
#            ignored so the remaining checks still run.
# NOTE(review): a "<path>.tmp" file may also exist while the unit is still
# writing its first result; this cannot be told apart from an interrupted
# write here — confirm the units tolerate a restart in that situation.
recover_artifact() {
  local path="$1" label="$2" svc="$3"
  local name="${path##*/}"
  local state

  state="$(artifact_state "${path}")"
  case "${state}" in
    ready)
      return 0
      ;;
    interrupted)
      log "${name}.tmp exists — interrupted ${label} write detected"
      ;;
    *)
      log "${name} missing or empty"
      ;;
  esac

  restart_service "${svc}" || true
}

mkdir -p "${EXPORT_DIR}" /run

# mkdir fails when the directory already exists, so only one run can create
# the lock. A failure with no lock directory present is a different problem
# (the lock could not be created at all) and is reported as such instead of
# being mistaken for a concurrent run.
# NOTE(review): a run that is killed without executing its EXIT trap leaves
# the lock behind until it is removed or /run is recreated — confirm whether
# stale-lock recovery is needed.
if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
  if [ -d "${LOCK_DIR}" ]; then
    log "another self-heal run is already active"
  else
    log "WARN: cannot create lock ${LOCK_DIR}"
  fi
  exit 0
fi
# Release the lock on every exit path from here on.
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT

log "start"

# GPU present on the bus but no device node: restart the NVIDIA unit.
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
  log "NVIDIA GPU detected but /dev/nvidia0 is missing"
  restart_service bee-nvidia.service || true
fi

recover_artifact "${RUNTIME_JSON}" "runtime-health" bee-preflight.service
recover_artifact "${AUDIT_JSON}" "audit" bee-audit.service

# The HTTP probe is only attempted when systemd reports the unit as active.
if ! service_active bee-web.service; then
  log "bee-web.service is not active"
  restart_service bee-web.service || true
elif ! web_healthy; then
  log "bee-web health check failed"
  restart_service bee-web.service || true
fi

log "done"
|
||||
Reference in New Issue
Block a user