Add stability hardening and self-heal recovery

This commit is contained in:
2026-04-05 10:29:37 +03:00
parent 9826d437a5
commit 143b7dca5d
18 changed files with 495 additions and 111 deletions

View File

@@ -7,6 +7,7 @@ import (
"io"
"log/slog"
"os"
"runtime/debug"
"strings"
"bee/audit/internal/app"
@@ -29,10 +30,21 @@ func main() {
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
}
func run(args []string, stdout, stderr io.Writer) int {
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: slog.LevelInfo,
})))
defer func() {
rec := recover()
if rec == nil {
return
}
slog.Error("fatal panic",
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
exitCode = 1
}()
if len(args) == 0 {
printRootUsage(stderr)

View File

@@ -195,13 +195,11 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
return "stdout", err
case strings.HasPrefix(output, "file:"):
path := strings.TrimPrefix(output, "file:")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return "", err
err := atomicWriteFile(path, append(data, '\n'), 0644)
if err == nil {
return path, nil
}
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err
}
return path, nil
return "", err
default:
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
}
@@ -223,13 +221,11 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
return "stdout", err
case strings.HasPrefix(output, "file:"):
path := strings.TrimPrefix(output, "file:")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return "", err
err := atomicWriteFile(path, append(data, '\n'), 0644)
if err == nil {
return path, nil
}
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err
}
return path, nil
return "", err
default:
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
}

View File

@@ -0,0 +1,48 @@
package app
import (
"fmt"
"os"
"path/filepath"
)
// atomicWriteFile replaces the file at path with data without exposing a
// partially written target.
//
// The parent directory is created if needed (mode 0755). The payload is
// written to a sibling "<path>.tmp" file opened with perm, synced, closed and
// then renamed over path. Afterwards the parent directory is synced on a
// best-effort basis; failures of that last step are ignored. When any earlier
// step after opening the temp file fails, the temp file is removed and the
// wrapped error is returned.
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
	parent := filepath.Dir(path)
	if err := os.MkdirAll(parent, 0755); err != nil {
		return fmt.Errorf("mkdir %s: %w", parent, err)
	}

	tmpPath := path + ".tmp"
	tmp, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
	if err != nil {
		return fmt.Errorf("open temp %s: %w", tmpPath, err)
	}

	// committed flips to true only once the whole sequence has finished;
	// until then the deferred cleanup removes the temp file.
	committed := false
	defer func() {
		// Closing again after the explicit Close below is harmless; the
		// error is intentionally discarded.
		_ = tmp.Close()
		if committed {
			return
		}
		_ = os.Remove(tmpPath)
	}()

	_, err = tmp.Write(data)
	if err != nil {
		return fmt.Errorf("write temp %s: %w", tmpPath, err)
	}

	err = tmp.Sync()
	if err != nil {
		return fmt.Errorf("sync temp %s: %w", tmpPath, err)
	}

	err = tmp.Close()
	if err != nil {
		return fmt.Errorf("close temp %s: %w", tmpPath, err)
	}

	err = os.Rename(tmpPath, path)
	if err != nil {
		return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
	}

	// Best effort: sync the directory entry; errors here are not reported.
	if dir, openErr := os.Open(parent); openErr == nil {
		_ = dir.Sync()
		_ = dir.Close()
	}

	committed = true
	return nil
}

View File

@@ -0,0 +1,71 @@
package app
import (
"encoding/json"
"os"
"path/filepath"
"testing"
"bee/audit/internal/schema"
)
// TestAtomicWriteFileReplacesTargetWithoutLeavingTmp seeds an existing file,
// overwrites it through atomicWriteFile, and checks that the final content is
// the new payload and that no "<path>.tmp" sibling remains.
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
	const want = "new\n"
	target := filepath.Join(t.TempDir(), "bee-audit.json")

	err := os.WriteFile(target, []byte("old\n"), 0644)
	if err != nil {
		t.Fatalf("seed file: %v", err)
	}

	err = atomicWriteFile(target, []byte(want), 0644)
	if err != nil {
		t.Fatalf("atomicWriteFile: %v", err)
	}

	got, err := os.ReadFile(target)
	if err != nil {
		t.Fatalf("read final: %v", err)
	}
	if content := string(got); content != want {
		t.Fatalf("final content=%q want %q", content, want)
	}

	_, statErr := os.Stat(target + ".tmp")
	if !os.IsNotExist(statErr) {
		t.Fatalf("tmp file should be absent after success, err=%v", statErr)
	}
}
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
path := filepath.Join(t.TempDir(), "runtime-health.json")
a := &App{
runtime: fakeRuntime{
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
return schema.RuntimeHealth{
Status: "OK",
ExportDir: exportDir,
DriverReady: true,
CUDAReady: true,
}, nil
},
},
}
got, err := a.RunRuntimePreflight("file:" + path)
if err != nil {
t.Fatalf("RunRuntimePreflight: %v", err)
}
if got != path {
t.Fatalf("path=%q want %q", got, path)
}
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
t.Fatalf("tmp file should be absent after success, err=%v", err)
}
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read runtime file: %v", err)
}
var health schema.RuntimeHealth
if err := json.Unmarshal(raw, &health); err != nil {
t.Fatalf("json unmarshal: %v", err)
}
if health.Status != "OK" {
t.Fatalf("status=%q want OK", health.Status)
}
}

View File

@@ -19,6 +19,8 @@ var supportBundleServices = []string{
"bee-network.service",
"bee-nvidia.service",
"bee-preflight.service",
"bee-selfheal.service",
"bee-selfheal.timer",
"bee-sshsetup.service",
}

View File

@@ -110,6 +110,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
scanDone := make(chan error, 1)
go func() {
defer func() {
if rec := recover(); rec != nil {
scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
}
}()
scanner := bufio.NewScanner(pr)
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
for scanner.Scan() {

View File

@@ -84,12 +84,12 @@ func (m *jobManager) create(id string) *jobState {
j := &jobState{}
m.jobs[id] = j
// Schedule cleanup after 30 minutes
go func() {
goRecoverOnce("job cleanup", func() {
time.Sleep(30 * time.Minute)
m.mu.Lock()
delete(m.jobs, id)
m.mu.Unlock()
}()
})
return j
}

View File

@@ -17,10 +17,10 @@ import (
// It supports multiple concurrent SAT tasks: a shared event window is open
// while any SAT task is running, and flushed when all tasks complete.
type kmsgWatcher struct {
mu sync.Mutex
activeCount int // number of in-flight SAT tasks
window *kmsgWindow
statusDB *app.ComponentStatusDB
mu sync.Mutex
activeCount int // number of in-flight SAT tasks
window *kmsgWindow
statusDB *app.ComponentStatusDB
}
type kmsgWindow struct {
@@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
// start launches the background kmsg reading goroutine.
func (w *kmsgWatcher) start() {
go w.run()
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
}
func (w *kmsgWatcher) run() {
f, err := os.Open("/dev/kmsg")
if err != nil {
slog.Warn("kmsg watcher unavailable", "err", err)
return
}
defer f.Close()
// Best-effort seek to end so we only capture events from now forward.
_, _ = f.Seek(0, io.SeekEnd)
scanner := bufio.NewScanner(f)
scanner.Buffer(make([]byte, 64*1024), 64*1024)
for scanner.Scan() {
line := scanner.Text()
evt, ok := parseKmsgLine(line)
if !ok {
for {
f, err := os.Open("/dev/kmsg")
if err != nil {
slog.Warn("kmsg watcher unavailable", "err", err)
time.Sleep(30 * time.Second)
continue
}
w.mu.Lock()
if w.window != nil {
w.recordEvent(evt)
// Best-effort seek to end so we only capture events from now forward.
_, _ = f.Seek(0, io.SeekEnd)
scanner := bufio.NewScanner(f)
scanner.Buffer(make([]byte, 64*1024), 64*1024)
for scanner.Scan() {
line := scanner.Text()
evt, ok := parseKmsgLine(line)
if !ok {
continue
}
w.mu.Lock()
if w.window != nil {
w.recordEvent(evt)
}
w.mu.Unlock()
}
w.mu.Unlock()
}
if err := scanner.Err(); err != nil {
slog.Warn("kmsg watcher stopped", "err", err)
if err := scanner.Err(); err != nil {
slog.Warn("kmsg watcher stopped", "err", err)
}
_ = f.Close()
time.Sleep(2 * time.Second)
}
}
@@ -134,7 +137,7 @@ func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
if window == nil || len(window.events) == 0 {
return
}
go w.flushWindow(window)
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
}
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {

View File

@@ -11,6 +11,7 @@ import (
"net/http"
"os"
"path/filepath"
"runtime/debug"
"sort"
"strconv"
"strings"
@@ -311,11 +312,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("GET /", h.handlePage)
h.mux = mux
return mux
return recoverMiddleware(mux)
}
func (h *handler) startMetricsCollector() {
go func() {
goRecoverLoop("metrics collector", 2*time.Second, func() {
ticker := time.NewTicker(metricsCollectInterval)
defer ticker.Stop()
for range ticker.C {
@@ -326,7 +327,7 @@ func (h *handler) startMetricsCollector() {
h.feedRings(sample)
h.setLatestMetric(sample)
}
}()
})
}
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
@@ -347,7 +348,49 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
// ListenAndServe starts the HTTP server.
func ListenAndServe(addr string, opts HandlerOptions) error {
return http.ListenAndServe(addr, NewHandler(opts))
srv := &http.Server{
Addr: addr,
Handler: NewHandler(opts),
ReadHeaderTimeout: 5 * time.Second,
ReadTimeout: 30 * time.Second,
IdleTimeout: 2 * time.Minute,
}
return srv.ListenAndServe()
}
// trackingResponseWriter wraps an http.ResponseWriter and records whether the
// handler has already started the response (status line or body), so the
// recover middleware knows if it can still send its own error reply.
type trackingResponseWriter struct {
	http.ResponseWriter
	wroteHeader bool
}

// WriteHeader marks the response as started and forwards the status code.
func (w *trackingResponseWriter) WriteHeader(statusCode int) {
	w.wroteHeader = true
	w.ResponseWriter.WriteHeader(statusCode)
}

// Write marks the response as started and forwards the payload.
func (w *trackingResponseWriter) Write(p []byte) (int, error) {
	w.wroteHeader = true
	return w.ResponseWriter.Write(p)
}

// Flush forwards to the wrapped writer when it implements http.Flusher.
// Without this method the wrapper would hide flushing from handlers that
// type-assert the writer to http.Flusher. Flushing commits the headers, so the
// response is marked as started.
func (w *trackingResponseWriter) Flush() {
	f, ok := w.ResponseWriter.(http.Flusher)
	if !ok {
		return
	}
	w.wroteHeader = true
	f.Flush()
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional capabilities of the underlying ResponseWriter.
func (w *trackingResponseWriter) Unwrap() http.ResponseWriter {
	return w.ResponseWriter
}

// recoverMiddleware wraps next so that a panic inside a handler is logged with
// its stack trace and, when nothing has been written yet, answered with a
// 500 "internal server error" response instead of propagating.
//
// A panic with http.ErrAbortHandler is re-raised untouched: net/http uses that
// sentinel to abort the response on purpose, so it must not be logged as a
// failure or converted into a 500.
func recoverMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		tw := &trackingResponseWriter{ResponseWriter: w}
		defer func() {
			rec := recover()
			if rec == nil {
				return
			}
			if rec == http.ErrAbortHandler {
				panic(rec)
			}
			slog.Error("http handler panic",
				"method", r.Method,
				"path", r.URL.Path,
				"panic", fmt.Sprint(rec),
				"stack", string(debug.Stack()),
			)
			// Once the handler has started the response the status can no
			// longer be changed, so only reply when nothing was sent.
			if !tw.wroteHeader {
				http.Error(tw, "internal server error", http.StatusInternalServerError)
			}
		}()
		next.ServeHTTP(tw, r)
	})
}
// ── Infrastructure handlers ──────────────────────────────────────────────────

View File

@@ -34,6 +34,23 @@ func TestChartLegendNumber(t *testing.T) {
}
}
// TestRecoverMiddlewareReturns500OnPanic serves a request through
// recoverMiddleware with a handler that panics and checks that the recorded
// response is a 500 whose body contains "internal server error".
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
	panicking := http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
		panic("boom")
	})
	wrapped := recoverMiddleware(panicking)

	recorder := httptest.NewRecorder()
	request := httptest.NewRequest(http.MethodGet, "/panic", nil)
	wrapped.ServeHTTP(recorder, request)

	if got, want := recorder.Code, http.StatusInternalServerError; got != want {
		t.Fatalf("status=%d want %d", got, want)
	}
	if body := recorder.Body.String(); !strings.Contains(body, "internal server error") {
		t.Fatalf("body=%q", body)
	}
}
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
samples := []platform.LiveMetricSample{
{

View File

@@ -0,0 +1,42 @@
package webui
import (
"fmt"
"log/slog"
"runtime/debug"
"time"
)
// goRecoverLoop starts a goroutine that runs fn under runRecoverable and keeps
// restarting it for as long as it ends in a panic. When restartDelay is
// positive the goroutine sleeps that long before each restart. The goroutine
// exits the first time fn returns without panicking.
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
	supervise := func() {
		// The loop condition runs fn; it is true only when fn panicked.
		for runRecoverable(name, fn) {
			if restartDelay <= 0 {
				continue
			}
			time.Sleep(restartDelay)
		}
	}
	go supervise()
}
// goRecoverOnce runs fn a single time in its own goroutine under
// runRecoverable, so a panic in fn is logged rather than propagated. The
// panicked result is discarded.
func goRecoverOnce(name string, fn func()) {
	go runRecoverable(name, fn)
}
// runRecoverable calls fn and reports whether it panicked. A panic is
// recovered and logged through slog with the component name, the panic value
// and the current stack trace; a normal return yields false.
func runRecoverable(name string, fn func()) (panicked bool) {
	defer func() {
		rec := recover()
		if rec == nil {
			return
		}
		// The named result is the only way to report the outcome, because
		// the return statement below is skipped when fn panics.
		panicked = true
		slog.Error("recovered panic",
			"component", name,
			"panic", fmt.Sprint(rec),
			"stack", string(debug.Stack()),
		)
	}()

	fn()
	return false
}

View File

@@ -4,10 +4,12 @@ import (
"context"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime/debug"
"sort"
"strings"
"sync"
@@ -377,7 +379,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
if !q.started {
q.loadLocked()
q.started = true
go q.worker()
goRecoverLoop("task worker", 2*time.Second, q.worker)
}
hasPending := q.nextPending() != nil
q.mu.Unlock()
@@ -392,75 +394,90 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
func (q *taskQueue) worker() {
for {
<-q.trigger
setCPUGovernor("performance")
func() {
setCPUGovernor("performance")
defer setCPUGovernor("powersave")
// Drain all pending tasks and start them in parallel.
q.mu.Lock()
var batch []*Task
for {
t := q.nextPending()
if t == nil {
break
// Drain all pending tasks and start them in parallel.
q.mu.Lock()
var batch []*Task
for {
t := q.nextPending()
if t == nil {
break
}
now := time.Now()
t.Status = TaskRunning
t.StartedAt = &now
t.DoneAt = nil
t.ErrMsg = ""
j := newTaskJobState(t.LogPath)
t.job = j
batch = append(batch, t)
}
now := time.Now()
t.Status = TaskRunning
t.StartedAt = &now
t.DoneAt = nil
t.ErrMsg = ""
j := newTaskJobState(t.LogPath)
t.job = j
batch = append(batch, t)
}
if len(batch) > 0 {
q.persistLocked()
}
q.mu.Unlock()
if len(batch) > 0 {
q.persistLocked()
}
q.mu.Unlock()
var wg sync.WaitGroup
for _, t := range batch {
t := t
j := t.job
taskCtx, taskCancel := context.WithCancel(context.Background())
j.cancel = taskCancel
wg.Add(1)
go func() {
defer wg.Done()
var wg sync.WaitGroup
for _, t := range batch {
t := t
j := t.job
taskCtx, taskCancel := context.WithCancel(context.Background())
j.cancel = taskCancel
wg.Add(1)
goRecoverOnce("task "+t.Target, func() {
defer wg.Done()
defer func() {
if rec := recover(); rec != nil {
msg := fmt.Sprintf("task panic: %v", rec)
slog.Error("task panic",
"task_id", t.ID,
"target", t.Target,
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
j.append("ERROR: " + msg)
j.finish(msg)
}
}()
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
}
q.runTask(t, j, taskCtx)
if q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
q.mu.Lock()
now2 := time.Now()
t.DoneAt = &now2
if t.Status == TaskRunning {
if j.err != "" {
t.Status = TaskFailed
t.ErrMsg = j.err
} else {
t.Status = TaskDone
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
}
}
q.runTask(t, j, taskCtx)
if q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
q.mu.Lock()
now2 := time.Now()
t.DoneAt = &now2
if t.Status == TaskRunning {
if j.err != "" {
t.Status = TaskFailed
t.ErrMsg = j.err
} else {
t.Status = TaskDone
}
}
q.persistLocked()
q.mu.Unlock()
})
}
wg.Wait()
if len(batch) > 0 {
q.mu.Lock()
q.prune()
q.persistLocked()
q.mu.Unlock()
}()
}
wg.Wait()
}
}()
if len(batch) > 0 {
q.mu.Lock()
q.prune()
q.persistLocked()
q.mu.Unlock()
}
setCPUGovernor("powersave")
}
}