From 143b7dca5d3f8ab3a41ac2b559a31dcd48b0f005 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 5 Apr 2026 10:29:37 +0300 Subject: [PATCH] Add stability hardening and self-heal recovery --- audit/cmd/bee/main.go | 14 +- audit/internal/app/app.go | 20 +-- audit/internal/app/atomic_write.go | 48 ++++++ audit/internal/app/atomic_write_test.go | 71 +++++++++ audit/internal/app/support_bundle.go | 2 + audit/internal/webui/api.go | 5 + audit/internal/webui/jobs.go | 4 +- audit/internal/webui/kmsg_watcher.go | 61 ++++---- audit/internal/webui/server.go | 51 ++++++- audit/internal/webui/server_test.go | 17 +++ audit/internal/webui/stability.go | 42 ++++++ audit/internal/webui/tasks.go | 141 ++++++++++-------- .../hooks/normal/9000-bee-setup.hook.chroot | 2 + iso/builder/smoketest.sh | 6 + .../etc/systemd/system/bee-selfheal.service | 9 ++ .../etc/systemd/system/bee-selfheal.timer | 11 ++ .../etc/systemd/system/bee-web.service | 3 +- iso/overlay/usr/local/bin/bee-selfheal | 99 ++++++++++++ 18 files changed, 495 insertions(+), 111 deletions(-) create mode 100644 audit/internal/app/atomic_write.go create mode 100644 audit/internal/app/atomic_write_test.go create mode 100644 audit/internal/webui/stability.go create mode 100644 iso/overlay/etc/systemd/system/bee-selfheal.service create mode 100644 iso/overlay/etc/systemd/system/bee-selfheal.timer create mode 100644 iso/overlay/usr/local/bin/bee-selfheal diff --git a/audit/cmd/bee/main.go b/audit/cmd/bee/main.go index f6b3856..5fb6c1e 100644 --- a/audit/cmd/bee/main.go +++ b/audit/cmd/bee/main.go @@ -7,6 +7,7 @@ import ( "io" "log/slog" "os" + "runtime/debug" "strings" "bee/audit/internal/app" @@ -29,10 +30,21 @@ func main() { os.Exit(run(os.Args[1:], os.Stdout, os.Stderr)) } -func run(args []string, stdout, stderr io.Writer) int { +func run(args []string, stdout, stderr io.Writer) (exitCode int) { slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ Level: slog.LevelInfo, }))) + defer func() { + rec := recover() + if rec == nil { + return + } + slog.Error("fatal panic", + "panic", fmt.Sprint(rec), + "stack", string(debug.Stack()), + ) + exitCode = 1 + }() if len(args) == 0 { printRootUsage(stderr) diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 11f6c73..1a18863 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -195,13 +195,11 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro return "stdout", err case strings.HasPrefix(output, "file:"): path := strings.TrimPrefix(output, "file:") - if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { - return "", err + err := atomicWriteFile(path, append(data, '\n'), 0644) + if err == nil { + return path, nil } - if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil { - return "", err - } - return path, nil + return "", err default: return "", fmt.Errorf("unknown output destination %q — use stdout or file:", output) } @@ -223,13 +221,11 @@ func (a *App) RunRuntimePreflight(output string) (string, error) { return "stdout", err case strings.HasPrefix(output, "file:"): path := strings.TrimPrefix(output, "file:") - if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { - return "", err + err := atomicWriteFile(path, append(data, '\n'), 0644) + if err == nil { + return path, nil } - if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil { - return "", err - } - return path, nil + return "", err default: return "", fmt.Errorf("unknown output destination %q — use stdout or file:", output) } diff --git a/audit/internal/app/atomic_write.go b/audit/internal/app/atomic_write.go new file mode 100644 index 0000000..a8b6edc --- /dev/null +++ b/audit/internal/app/atomic_write.go @@ -0,0 +1,48 @@ +package app + +import ( + "fmt" + "os" + "path/filepath" +) + +func atomicWriteFile(path string, data []byte, perm os.FileMode) error { + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err) + } + + tmpPath := path + ".tmp" + f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm) + if err != nil { + return fmt.Errorf("open temp %s: %w", tmpPath, err) + } + + success := false + defer func() { + _ = f.Close() + if !success { + _ = os.Remove(tmpPath) + } + }() + + if _, err := f.Write(data); err != nil { + return fmt.Errorf("write temp %s: %w", tmpPath, err) + } + if err := f.Sync(); err != nil { + return fmt.Errorf("sync temp %s: %w", tmpPath, err) + } + if err := f.Close(); err != nil { + return fmt.Errorf("close temp %s: %w", tmpPath, err) + } + if err := os.Rename(tmpPath, path); err != nil { + return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err) + } + + if dir, err := os.Open(filepath.Dir(path)); err == nil { + _ = dir.Sync() + _ = dir.Close() + } + + success = true + return nil +} diff --git a/audit/internal/app/atomic_write_test.go b/audit/internal/app/atomic_write_test.go new file mode 100644 index 0000000..9a71287 --- /dev/null +++ b/audit/internal/app/atomic_write_test.go @@ -0,0 +1,71 @@ +package app + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "bee/audit/internal/schema" +) + +func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) { + path := filepath.Join(t.TempDir(), "bee-audit.json") + if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil { + t.Fatalf("seed file: %v", err) + } + + if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil { + t.Fatalf("atomicWriteFile: %v", err) + } + + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read final: %v", err) + } + if string(raw) != "new\n" { + t.Fatalf("final content=%q want %q", string(raw), "new\n") + } + if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) { + t.Fatalf("tmp file should be absent after success, err=%v", err) + } +} + +func TestRunRuntimePreflightWritesAtomically(t *testing.T) { + path := filepath.Join(t.TempDir(), "runtime-health.json") + a := &App{ + runtime: fakeRuntime{ + collectFn: func(exportDir string) (schema.RuntimeHealth, error) { + return schema.RuntimeHealth{ + Status: "OK", + ExportDir: exportDir, + DriverReady: true, + CUDAReady: true, + }, nil + }, + }, + } + + got, err := a.RunRuntimePreflight("file:" + path) + if err != nil { + t.Fatalf("RunRuntimePreflight: %v", err) + } + if got != path { + t.Fatalf("path=%q want %q", got, path) + } + if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) { + t.Fatalf("tmp file should be absent after success, err=%v", err) + } + + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read runtime file: %v", err) + } + var health schema.RuntimeHealth + if err := json.Unmarshal(raw, &health); err != nil { + t.Fatalf("json unmarshal: %v", err) + } + if health.Status != "OK" { + t.Fatalf("status=%q want OK", health.Status) + } +} diff --git a/audit/internal/app/support_bundle.go b/audit/internal/app/support_bundle.go index 57d7d36..766d643 100644 --- a/audit/internal/app/support_bundle.go +++ b/audit/internal/app/support_bundle.go @@ -19,6 +19,8 @@ var supportBundleServices = []string{ "bee-network.service", "bee-nvidia.service", "bee-preflight.service", + "bee-selfheal.service", + "bee-selfheal.timer", "bee-sshsetup.service", } diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index f2f8bb1..c88cb54 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -110,6 +110,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error { scanDone := make(chan error, 1) go func() { + defer func() { + if rec := recover(); rec != nil { + scanDone <- fmt.Errorf("stream scanner panic: %v", rec) + } + }() scanner := bufio.NewScanner(pr) scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) for scanner.Scan() { diff --git a/audit/internal/webui/jobs.go b/audit/internal/webui/jobs.go index a255529..1e08560 100644 --- a/audit/internal/webui/jobs.go +++ b/audit/internal/webui/jobs.go @@ -84,12 +84,12 @@ func (m *jobManager) create(id string) *jobState { j := &jobState{} m.jobs[id] = j // Schedule cleanup after 30 minutes - go func() { + goRecoverOnce("job cleanup", func() { time.Sleep(30 * time.Minute) m.mu.Lock() delete(m.jobs, id) m.mu.Unlock() - }() + }) return j } diff --git a/audit/internal/webui/kmsg_watcher.go b/audit/internal/webui/kmsg_watcher.go index 0672ddd..ad413a2 100644 --- a/audit/internal/webui/kmsg_watcher.go +++ b/audit/internal/webui/kmsg_watcher.go @@ -17,10 +17,10 @@ import ( // It supports multiple concurrent SAT tasks: a shared event window is open // while any SAT task is running, and flushed when all tasks complete. type kmsgWatcher struct { - mu sync.Mutex - activeCount int // number of in-flight SAT tasks - window *kmsgWindow - statusDB *app.ComponentStatusDB + mu sync.Mutex + activeCount int // number of in-flight SAT tasks + window *kmsgWindow + statusDB *app.ComponentStatusDB } type kmsgWindow struct { @@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher { // start launches the background kmsg reading goroutine. func (w *kmsgWatcher) start() { - go w.run() + goRecoverLoop("kmsg watcher", 5*time.Second, w.run) } func (w *kmsgWatcher) run() { - f, err := os.Open("/dev/kmsg") - if err != nil { - slog.Warn("kmsg watcher unavailable", "err", err) - return - } - defer f.Close() - - // Best-effort seek to end so we only capture events from now forward. - _, _ = f.Seek(0, io.SeekEnd) - - scanner := bufio.NewScanner(f) - scanner.Buffer(make([]byte, 64*1024), 64*1024) - for scanner.Scan() { - line := scanner.Text() - evt, ok := parseKmsgLine(line) - if !ok { + for { + f, err := os.Open("/dev/kmsg") + if err != nil { + slog.Warn("kmsg watcher unavailable", "err", err) + time.Sleep(30 * time.Second) continue } - w.mu.Lock() - if w.window != nil { - w.recordEvent(evt) + // Best-effort seek to end so we only capture events from now forward. + _, _ = f.Seek(0, io.SeekEnd) + + scanner := bufio.NewScanner(f) + scanner.Buffer(make([]byte, 64*1024), 64*1024) + for scanner.Scan() { + line := scanner.Text() + evt, ok := parseKmsgLine(line) + if !ok { + continue + } + w.mu.Lock() + if w.window != nil { + w.recordEvent(evt) + } + w.mu.Unlock() } - w.mu.Unlock() - } - if err := scanner.Err(); err != nil { - slog.Warn("kmsg watcher stopped", "err", err) + if err := scanner.Err(); err != nil { + slog.Warn("kmsg watcher stopped", "err", err) + } + _ = f.Close() + time.Sleep(2 * time.Second) } } @@ -134,7 +137,7 @@ func (w *kmsgWatcher) NotifyTaskFinished(taskID string) { if window == nil || len(window.events) == 0 { return } - go w.flushWindow(window) + goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) }) } func (w *kmsgWatcher) flushWindow(window *kmsgWindow) { diff --git a/audit/internal/webui/server.go b/audit/internal/webui/server.go index ab85603..e4165af 100644 --- a/audit/internal/webui/server.go +++ b/audit/internal/webui/server.go @@ -11,6 +11,7 @@ import ( "net/http" "os" "path/filepath" + "runtime/debug" "sort" "strconv" "strings" @@ -311,11 +312,11 @@ func NewHandler(opts HandlerOptions) http.Handler { mux.HandleFunc("GET /", h.handlePage) h.mux = mux - return mux + return recoverMiddleware(mux) } func (h *handler) startMetricsCollector() { - go func() { + goRecoverLoop("metrics collector", 2*time.Second, func() { ticker := time.NewTicker(metricsCollectInterval) defer ticker.Stop() for range ticker.C { @@ -326,7 +327,7 @@ func (h *handler) startMetricsCollector() { h.feedRings(sample) h.setLatestMetric(sample) } - }() + }) } func (h *handler) setLatestMetric(sample platform.LiveMetricSample) { @@ -347,7 +348,49 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) { // ListenAndServe starts the HTTP server. func ListenAndServe(addr string, opts HandlerOptions) error { - return http.ListenAndServe(addr, NewHandler(opts)) + srv := &http.Server{ + Addr: addr, + Handler: NewHandler(opts), + ReadHeaderTimeout: 5 * time.Second, + ReadTimeout: 30 * time.Second, + IdleTimeout: 2 * time.Minute, + } + return srv.ListenAndServe() +} + +type trackingResponseWriter struct { + http.ResponseWriter + wroteHeader bool +} + +func (w *trackingResponseWriter) WriteHeader(statusCode int) { + w.wroteHeader = true + w.ResponseWriter.WriteHeader(statusCode) +} + +func (w *trackingResponseWriter) Write(p []byte) (int, error) { + w.wroteHeader = true + return w.ResponseWriter.Write(p) +} + +func recoverMiddleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + tw := &trackingResponseWriter{ResponseWriter: w} + defer func() { + if rec := recover(); rec != nil { + slog.Error("http handler panic", + "method", r.Method, + "path", r.URL.Path, + "panic", fmt.Sprint(rec), + "stack", string(debug.Stack()), + ) + if !tw.wroteHeader { + http.Error(tw, "internal server error", http.StatusInternalServerError) + } + } + }() + next.ServeHTTP(tw, r) + }) } // ── Infrastructure handlers ────────────────────────────────────────────────── diff --git a/audit/internal/webui/server_test.go b/audit/internal/webui/server_test.go index c67954f..abe1ea5 100644 --- a/audit/internal/webui/server_test.go +++ b/audit/internal/webui/server_test.go @@ -34,6 +34,23 @@ func TestChartLegendNumber(t *testing.T) { } } +func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) { + handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + panic("boom") + })) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/panic", nil) + + handler.ServeHTTP(rec, req) + + if rec.Code != http.StatusInternalServerError { + t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError) + } + if !strings.Contains(rec.Body.String(), "internal server error") { + t.Fatalf("body=%q", rec.Body.String()) + } +} + func TestChartDataFromSamplesUsesFullHistory(t *testing.T) { samples := []platform.LiveMetricSample{ { diff --git a/audit/internal/webui/stability.go b/audit/internal/webui/stability.go new file mode 100644 index 0000000..8ece151 --- /dev/null +++ b/audit/internal/webui/stability.go @@ -0,0 +1,42 @@ +package webui + +import ( + "fmt" + "log/slog" + "runtime/debug" + "time" +) + +func goRecoverLoop(name string, restartDelay time.Duration, fn func()) { + go func() { + for { + if !runRecoverable(name, fn) { + return + } + if restartDelay > 0 { + time.Sleep(restartDelay) + } + } + }() +} + +func goRecoverOnce(name string, fn func()) { + go func() { + _ = runRecoverable(name, fn) + }() +} + +func runRecoverable(name string, fn func()) (panicked bool) { + defer func() { + if rec := recover(); rec != nil { + panicked = true + slog.Error("recovered panic", + "component", name, + "panic", fmt.Sprint(rec), + "stack", string(debug.Stack()), + ) + } + }() + fn() + return false +} diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index 039b324..743a20b 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -4,10 +4,12 @@ import ( "context" "encoding/json" "fmt" + "log/slog" "net/http" "os" "os/exec" "path/filepath" + "runtime/debug" "sort" "strings" "sync" @@ -377,7 +379,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) { if !q.started { q.loadLocked() q.started = true - go q.worker() + goRecoverLoop("task worker", 2*time.Second, q.worker) } hasPending := q.nextPending() != nil q.mu.Unlock() @@ -392,75 +394,90 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) { func (q *taskQueue) worker() { for { <-q.trigger - setCPUGovernor("performance") + func() { + setCPUGovernor("performance") + defer setCPUGovernor("powersave") - // Drain all pending tasks and start them in parallel. - q.mu.Lock() - var batch []*Task - for { - t := q.nextPending() - if t == nil { - break + // Drain all pending tasks and start them in parallel. + q.mu.Lock() + var batch []*Task + for { + t := q.nextPending() + if t == nil { + break + } + now := time.Now() + t.Status = TaskRunning + t.StartedAt = &now + t.DoneAt = nil + t.ErrMsg = "" + j := newTaskJobState(t.LogPath) + t.job = j + batch = append(batch, t) } - now := time.Now() - t.Status = TaskRunning - t.StartedAt = &now - t.DoneAt = nil - t.ErrMsg = "" - j := newTaskJobState(t.LogPath) - t.job = j - batch = append(batch, t) - } - if len(batch) > 0 { - q.persistLocked() - } - q.mu.Unlock() + if len(batch) > 0 { + q.persistLocked() + } + q.mu.Unlock() - var wg sync.WaitGroup - for _, t := range batch { - t := t - j := t.job - taskCtx, taskCancel := context.WithCancel(context.Background()) - j.cancel = taskCancel - wg.Add(1) - go func() { - defer wg.Done() + var wg sync.WaitGroup + for _, t := range batch { + t := t + j := t.job + taskCtx, taskCancel := context.WithCancel(context.Background()) + j.cancel = taskCancel + wg.Add(1) + goRecoverOnce("task "+t.Target, func() { + defer wg.Done() + defer func() { + if rec := recover(); rec != nil { + msg := fmt.Sprintf("task panic: %v", rec) + slog.Error("task panic", + "task_id", t.ID, + "target", t.Target, + "panic", fmt.Sprint(rec), + "stack", string(debug.Stack()), + ) + j.append("ERROR: " + msg) + j.finish(msg) + } + }() - if q.kmsgWatcher != nil && isSATTarget(t.Target) { - q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target) - } - - q.runTask(t, j, taskCtx) - - if q.kmsgWatcher != nil { - q.kmsgWatcher.NotifyTaskFinished(t.ID) - } - - q.mu.Lock() - now2 := time.Now() - t.DoneAt = &now2 - if t.Status == TaskRunning { - if j.err != "" { - t.Status = TaskFailed - t.ErrMsg = j.err - } else { - t.Status = TaskDone + if q.kmsgWatcher != nil && isSATTarget(t.Target) { + q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target) } - } + + q.runTask(t, j, taskCtx) + + if q.kmsgWatcher != nil { + q.kmsgWatcher.NotifyTaskFinished(t.ID) + } + + q.mu.Lock() + now2 := time.Now() + t.DoneAt = &now2 + if t.Status == TaskRunning { + if j.err != "" { + t.Status = TaskFailed + t.ErrMsg = j.err + } else { + t.Status = TaskDone + } + } + q.persistLocked() + q.mu.Unlock() + }) + } + wg.Wait() + + if len(batch) > 0 { + q.mu.Lock() + q.prune() q.persistLocked() q.mu.Unlock() - }() - } - wg.Wait() + } + }() - if len(batch) > 0 { - q.mu.Lock() - q.prune() - q.persistLocked() - q.mu.Unlock() - } - - setCPUGovernor("powersave") } } diff --git a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot index 877bff9..fc5fbca 100755 --- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot +++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot @@ -30,6 +30,7 @@ systemctl enable bee-preflight.service systemctl enable bee-audit.service systemctl enable bee-web.service systemctl enable bee-sshsetup.service +systemctl enable bee-selfheal.timer systemctl enable ssh.service systemctl enable lightdm.service 2>/dev/null || true systemctl enable qemu-guest-agent.service 2>/dev/null || true @@ -58,6 +59,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true chmod +x /usr/local/bin/bee 2>/dev/null || true chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true +chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true if [ "$GPU_VENDOR" = "nvidia" ]; then chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true diff --git a/iso/builder/smoketest.sh b/iso/builder/smoketest.sh index 025249b..b3cd3a0 100644 --- a/iso/builder/smoketest.sh +++ b/iso/builder/smoketest.sh @@ -171,6 +171,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do fi done +if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then + ok "timer active: bee-selfheal.timer" +else + fail "timer NOT active: bee-selfheal.timer" +fi + echo "" echo "-- runtime health --" if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then diff --git a/iso/overlay/etc/systemd/system/bee-selfheal.service b/iso/overlay/etc/systemd/system/bee-selfheal.service new file mode 100644 index 0000000..451d22f --- /dev/null +++ b/iso/overlay/etc/systemd/system/bee-selfheal.service @@ -0,0 +1,9 @@ +[Unit] +Description=Bee: periodic runtime self-heal +After=bee-web.service bee-audit.service bee-preflight.service + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal +StandardOutput=journal +StandardError=journal diff --git a/iso/overlay/etc/systemd/system/bee-selfheal.timer b/iso/overlay/etc/systemd/system/bee-selfheal.timer new file mode 100644 index 0000000..c96a238 --- /dev/null +++ b/iso/overlay/etc/systemd/system/bee-selfheal.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Bee: run self-heal checks periodically + +[Timer] +OnBootSec=45sec +OnUnitActiveSec=60sec +AccuracySec=15sec +Unit=bee-selfheal.service + +[Install] +WantedBy=timers.target diff --git a/iso/overlay/etc/systemd/system/bee-web.service b/iso/overlay/etc/systemd/system/bee-web.service index f07595d..0fd7b9a 100644 --- a/iso/overlay/etc/systemd/system/bee-web.service +++ b/iso/overlay/etc/systemd/system/bee-web.service @@ -1,11 +1,12 @@ [Unit] Description=Bee: hardware audit web viewer +StartLimitIntervalSec=0 [Service] Type=simple ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit" Restart=always -RestartSec=2 +RestartSec=3 StandardOutput=journal StandardError=journal LimitMEMLOCK=infinity diff --git a/iso/overlay/usr/local/bin/bee-selfheal b/iso/overlay/usr/local/bin/bee-selfheal new file mode 100644 index 0000000..f52c90c --- /dev/null +++ b/iso/overlay/usr/local/bin/bee-selfheal @@ -0,0 +1,99 @@ +#!/bin/bash +# bee-selfheal — periodic best-effort recovery for critical live ISO services. + +set -u + +LOG_PREFIX="bee-selfheal" +EXPORT_DIR="/appdata/bee/export" +AUDIT_JSON="${EXPORT_DIR}/bee-audit.json" +RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json" +LOCK_DIR="/run/bee-selfheal.lock" + +log() { + echo "[${LOG_PREFIX}] $*" +} + +have_nvidia_gpu() { + lspci -nn 2>/dev/null | grep -qi '10de:' +} + +service_active() { + systemctl is-active --quiet "$1" 2>/dev/null +} + +restart_service() { + local svc="$1" + if systemctl restart "$svc" >/dev/null 2>&1; then + log "restarted ${svc}" + return 0 + fi + log "WARN: failed to restart ${svc}" + return 1 +} + +file_ready() { + [ -s "$1" ] +} + +artifact_state() { + local path="$1" + if [ -s "${path}" ]; then + echo "ready" + return 0 + fi + if [ -e "${path}.tmp" ]; then + echo "interrupted" + return 0 + fi + echo "missing" +} + +web_healthy() { + bash -c 'exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3' \ + >/dev/null 2>&1 +} + +mkdir -p "${EXPORT_DIR}" /run + +if ! mkdir "${LOCK_DIR}" 2>/dev/null; then + log "another self-heal run is already active" + exit 0 +fi +trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT + +log "start" + +if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then + log "NVIDIA GPU detected but /dev/nvidia0 is missing" + restart_service bee-nvidia.service || true +fi + +runtime_state="$(artifact_state "${RUNTIME_JSON}")" +if [ "${runtime_state}" != "ready" ]; then + if [ "${runtime_state}" = "interrupted" ]; then + log "runtime-health.json.tmp exists — interrupted runtime-health write detected" + else + log "runtime-health.json missing or empty" + fi + restart_service bee-preflight.service || true +fi + +audit_state="$(artifact_state "${AUDIT_JSON}")" +if [ "${audit_state}" != "ready" ]; then + if [ "${audit_state}" = "interrupted" ]; then + log "bee-audit.json.tmp exists — interrupted audit write detected" + else + log "bee-audit.json missing or empty" + fi + restart_service bee-audit.service || true +fi + +if ! service_active bee-web.service; then + log "bee-web.service is not active" + restart_service bee-web.service || true +elif ! web_healthy; then + log "bee-web health check failed" + restart_service bee-web.service || true +fi + +log "done"