Add stability hardening and self-heal recovery

This commit is contained in:
2026-04-05 10:29:37 +03:00
parent 9826d437a5
commit 143b7dca5d
18 changed files with 495 additions and 111 deletions

View File

@@ -30,6 +30,7 @@ systemctl enable bee-preflight.service
systemctl enable bee-audit.service
systemctl enable bee-web.service
systemctl enable bee-sshsetup.service
systemctl enable bee-selfheal.timer
systemctl enable ssh.service
systemctl enable lightdm.service 2>/dev/null || true
systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -58,6 +59,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
chmod +x /usr/local/bin/bee 2>/dev/null || true
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
if [ "$GPU_VENDOR" = "nvidia" ]; then
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true

View File

@@ -171,6 +171,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
fi
done
if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
ok "timer active: bee-selfheal.timer"
else
fail "timer NOT active: bee-selfheal.timer"
fi
echo ""
echo "-- runtime health --"
if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then

View File

@@ -0,0 +1,9 @@
[Unit]
Description=Bee: periodic runtime self-heal
After=bee-web.service bee-audit.service bee-preflight.service
[Service]
Type=oneshot
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
StandardOutput=journal
StandardError=journal

View File

@@ -0,0 +1,11 @@
[Unit]
Description=Bee: run self-heal checks periodically
[Timer]
OnBootSec=45sec
OnUnitActiveSec=60sec
AccuracySec=15sec
Unit=bee-selfheal.service
[Install]
WantedBy=timers.target

View File

@@ -1,11 +1,12 @@
[Unit]
Description=Bee: hardware audit web viewer
StartLimitIntervalSec=0
[Service]
Type=simple
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
Restart=always
RestartSec=2
RestartSec=3
StandardOutput=journal
StandardError=journal
LimitMEMLOCK=infinity

View File

@@ -0,0 +1,99 @@
#!/bin/bash
# bee-selfheal — periodic best-effort recovery for critical live ISO services.
set -u
LOG_PREFIX="bee-selfheal"
EXPORT_DIR="/appdata/bee/export"
AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
LOCK_DIR="/run/bee-selfheal.lock"
log() {
echo "[${LOG_PREFIX}] $*"
}
have_nvidia_gpu() {
lspci -nn 2>/dev/null | grep -qi '10de:'
}
service_active() {
systemctl is-active --quiet "$1" 2>/dev/null
}
restart_service() {
local svc="$1"
if systemctl restart "$svc" >/dev/null 2>&1; then
log "restarted ${svc}"
return 0
fi
log "WARN: failed to restart ${svc}"
return 1
}
file_ready() {
[ -s "$1" ]
}
artifact_state() {
local path="$1"
if [ -s "${path}" ]; then
echo "ready"
return 0
fi
if [ -e "${path}.tmp" ]; then
echo "interrupted"
return 0
fi
echo "missing"
}
web_healthy() {
bash -c 'exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3' \
>/dev/null 2>&1
}
mkdir -p "${EXPORT_DIR}" /run
if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
log "another self-heal run is already active"
exit 0
fi
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
log "start"
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
log "NVIDIA GPU detected but /dev/nvidia0 is missing"
restart_service bee-nvidia.service || true
fi
runtime_state="$(artifact_state "${RUNTIME_JSON}")"
if [ "${runtime_state}" != "ready" ]; then
if [ "${runtime_state}" = "interrupted" ]; then
log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
else
log "runtime-health.json missing or empty"
fi
restart_service bee-preflight.service || true
fi
audit_state="$(artifact_state "${AUDIT_JSON}")"
if [ "${audit_state}" != "ready" ]; then
if [ "${audit_state}" = "interrupted" ]; then
log "bee-audit.json.tmp exists — interrupted audit write detected"
else
log "bee-audit.json missing or empty"
fi
restart_service bee-audit.service || true
fi
if ! service_active bee-web.service; then
log "bee-web.service is not active"
restart_service bee-web.service || true
elif ! web_healthy; then
log "bee-web health check failed"
restart_service bee-web.service || true
fi
log "done"