Add stability hardening and self-heal recovery
This commit is contained in:
@@ -7,6 +7,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
"runtime/debug"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -29,10 +30,21 @@ func main() {
|
|||||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||||
}
|
}
|
||||||
|
|
||||||
func run(args []string, stdout, stderr io.Writer) int {
|
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||||
Level: slog.LevelInfo,
|
Level: slog.LevelInfo,
|
||||||
})))
|
})))
|
||||||
|
defer func() {
|
||||||
|
rec := recover()
|
||||||
|
if rec == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
slog.Error("fatal panic",
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
exitCode = 1
|
||||||
|
}()
|
||||||
|
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
printRootUsage(stderr)
|
printRootUsage(stderr)
|
||||||
|
|||||||
@@ -195,13 +195,11 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
err := atomicWriteFile(path, append(data, '\n'), 0644)
|
||||||
return "", err
|
if err == nil {
|
||||||
|
return path, nil
|
||||||
}
|
}
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
return "", err
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return path, nil
|
|
||||||
default:
|
default:
|
||||||
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
||||||
}
|
}
|
||||||
@@ -223,13 +221,11 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
err := atomicWriteFile(path, append(data, '\n'), 0644)
|
||||||
return "", err
|
if err == nil {
|
||||||
|
return path, nil
|
||||||
}
|
}
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
return "", err
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return path, nil
|
|
||||||
default:
|
default:
|
||||||
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
||||||
}
|
}
|
||||||
|
|||||||
48
audit/internal/app/atomic_write.go
Normal file
48
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
)
|
||||||
|
|
||||||
|
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
tmpPath := path + ".tmp"
|
||||||
|
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("open temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
success := false
|
||||||
|
defer func() {
|
||||||
|
_ = f.Close()
|
||||||
|
if !success {
|
||||||
|
_ = os.Remove(tmpPath)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if _, err := f.Write(data); err != nil {
|
||||||
|
return fmt.Errorf("write temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := f.Sync(); err != nil {
|
||||||
|
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := f.Close(); err != nil {
|
||||||
|
return fmt.Errorf("close temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmpPath, path); err != nil {
|
||||||
|
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if dir, err := os.Open(filepath.Dir(path)); err == nil {
|
||||||
|
_ = dir.Sync()
|
||||||
|
_ = dir.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
success = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||||
|
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("seed file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("atomicWriteFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read final: %v", err)
|
||||||
|
}
|
||||||
|
if string(raw) != "new\n" {
|
||||||
|
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||||
|
a := &App{
|
||||||
|
runtime: fakeRuntime{
|
||||||
|
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
return schema.RuntimeHealth{
|
||||||
|
Status: "OK",
|
||||||
|
ExportDir: exportDir,
|
||||||
|
DriverReady: true,
|
||||||
|
CUDAReady: true,
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := a.RunRuntimePreflight("file:" + path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||||
|
}
|
||||||
|
if got != path {
|
||||||
|
t.Fatalf("path=%q want %q", got, path)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read runtime file: %v", err)
|
||||||
|
}
|
||||||
|
var health schema.RuntimeHealth
|
||||||
|
if err := json.Unmarshal(raw, &health); err != nil {
|
||||||
|
t.Fatalf("json unmarshal: %v", err)
|
||||||
|
}
|
||||||
|
if health.Status != "OK" {
|
||||||
|
t.Fatalf("status=%q want OK", health.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-network.service",
|
"bee-network.service",
|
||||||
"bee-nvidia.service",
|
"bee-nvidia.service",
|
||||||
"bee-preflight.service",
|
"bee-preflight.service",
|
||||||
|
"bee-selfheal.service",
|
||||||
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -110,6 +110,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
|
|||||||
|
|
||||||
scanDone := make(chan error, 1)
|
scanDone := make(chan error, 1)
|
||||||
go func() {
|
go func() {
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
|
||||||
|
}
|
||||||
|
}()
|
||||||
scanner := bufio.NewScanner(pr)
|
scanner := bufio.NewScanner(pr)
|
||||||
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
|
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
|
|||||||
@@ -84,12 +84,12 @@ func (m *jobManager) create(id string) *jobState {
|
|||||||
j := &jobState{}
|
j := &jobState{}
|
||||||
m.jobs[id] = j
|
m.jobs[id] = j
|
||||||
// Schedule cleanup after 30 minutes
|
// Schedule cleanup after 30 minutes
|
||||||
go func() {
|
goRecoverOnce("job cleanup", func() {
|
||||||
time.Sleep(30 * time.Minute)
|
time.Sleep(30 * time.Minute)
|
||||||
m.mu.Lock()
|
m.mu.Lock()
|
||||||
delete(m.jobs, id)
|
delete(m.jobs, id)
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
}()
|
})
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -17,10 +17,10 @@ import (
|
|||||||
// It supports multiple concurrent SAT tasks: a shared event window is open
|
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||||
// while any SAT task is running, and flushed when all tasks complete.
|
// while any SAT task is running, and flushed when all tasks complete.
|
||||||
type kmsgWatcher struct {
|
type kmsgWatcher struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
activeCount int // number of in-flight SAT tasks
|
activeCount int // number of in-flight SAT tasks
|
||||||
window *kmsgWindow
|
window *kmsgWindow
|
||||||
statusDB *app.ComponentStatusDB
|
statusDB *app.ComponentStatusDB
|
||||||
}
|
}
|
||||||
|
|
||||||
type kmsgWindow struct {
|
type kmsgWindow struct {
|
||||||
@@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
|||||||
|
|
||||||
// start launches the background kmsg reading goroutine.
|
// start launches the background kmsg reading goroutine.
|
||||||
func (w *kmsgWatcher) start() {
|
func (w *kmsgWatcher) start() {
|
||||||
go w.run()
|
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *kmsgWatcher) run() {
|
func (w *kmsgWatcher) run() {
|
||||||
f, err := os.Open("/dev/kmsg")
|
for {
|
||||||
if err != nil {
|
f, err := os.Open("/dev/kmsg")
|
||||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
if err != nil {
|
||||||
return
|
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||||
}
|
time.Sleep(30 * time.Second)
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
// Best-effort seek to end so we only capture events from now forward.
|
|
||||||
_, _ = f.Seek(0, io.SeekEnd)
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(f)
|
|
||||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
|
||||||
for scanner.Scan() {
|
|
||||||
line := scanner.Text()
|
|
||||||
evt, ok := parseKmsgLine(line)
|
|
||||||
if !ok {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
w.mu.Lock()
|
// Best-effort seek to end so we only capture events from now forward.
|
||||||
if w.window != nil {
|
_, _ = f.Seek(0, io.SeekEnd)
|
||||||
w.recordEvent(evt)
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
evt, ok := parseKmsgLine(line)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.window != nil {
|
||||||
|
w.recordEvent(evt)
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
}
|
}
|
||||||
w.mu.Unlock()
|
if err := scanner.Err(); err != nil {
|
||||||
}
|
slog.Warn("kmsg watcher stopped", "err", err)
|
||||||
if err := scanner.Err(); err != nil {
|
}
|
||||||
slog.Warn("kmsg watcher stopped", "err", err)
|
_ = f.Close()
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,7 +137,7 @@ func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
|||||||
if window == nil || len(window.events) == 0 {
|
if window == nil || len(window.events) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
go w.flushWindow(window)
|
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"runtime/debug"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -311,11 +312,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("GET /", h.handlePage)
|
mux.HandleFunc("GET /", h.handlePage)
|
||||||
|
|
||||||
h.mux = mux
|
h.mux = mux
|
||||||
return mux
|
return recoverMiddleware(mux)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) startMetricsCollector() {
|
func (h *handler) startMetricsCollector() {
|
||||||
go func() {
|
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||||
ticker := time.NewTicker(metricsCollectInterval)
|
ticker := time.NewTicker(metricsCollectInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
@@ -326,7 +327,7 @@ func (h *handler) startMetricsCollector() {
|
|||||||
h.feedRings(sample)
|
h.feedRings(sample)
|
||||||
h.setLatestMetric(sample)
|
h.setLatestMetric(sample)
|
||||||
}
|
}
|
||||||
}()
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||||
@@ -347,7 +348,49 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
|||||||
|
|
||||||
// ListenAndServe starts the HTTP server.
|
// ListenAndServe starts the HTTP server.
|
||||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||||
return http.ListenAndServe(addr, NewHandler(opts))
|
srv := &http.Server{
|
||||||
|
Addr: addr,
|
||||||
|
Handler: NewHandler(opts),
|
||||||
|
ReadHeaderTimeout: 5 * time.Second,
|
||||||
|
ReadTimeout: 30 * time.Second,
|
||||||
|
IdleTimeout: 2 * time.Minute,
|
||||||
|
}
|
||||||
|
return srv.ListenAndServe()
|
||||||
|
}
|
||||||
|
|
||||||
|
type trackingResponseWriter struct {
|
||||||
|
http.ResponseWriter
|
||||||
|
wroteHeader bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) WriteHeader(statusCode int) {
|
||||||
|
w.wroteHeader = true
|
||||||
|
w.ResponseWriter.WriteHeader(statusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Write(p []byte) (int, error) {
|
||||||
|
w.wroteHeader = true
|
||||||
|
return w.ResponseWriter.Write(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
func recoverMiddleware(next http.Handler) http.Handler {
|
||||||
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
tw := &trackingResponseWriter{ResponseWriter: w}
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
slog.Error("http handler panic",
|
||||||
|
"method", r.Method,
|
||||||
|
"path", r.URL.Path,
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
if !tw.wroteHeader {
|
||||||
|
http.Error(tw, "internal server error", http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
next.ServeHTTP(tw, r)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -34,6 +34,23 @@ func TestChartLegendNumber(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
panic("boom")
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/panic", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusInternalServerError {
|
||||||
|
t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), "internal server error") {
|
||||||
|
t.Fatalf("body=%q", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
samples := []platform.LiveMetricSample{
|
samples := []platform.LiveMetricSample{
|
||||||
{
|
{
|
||||||
|
|||||||
42
audit/internal/webui/stability.go
Normal file
42
audit/internal/webui/stability.go
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"runtime/debug"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||||
|
go func() {
|
||||||
|
for {
|
||||||
|
if !runRecoverable(name, fn) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if restartDelay > 0 {
|
||||||
|
time.Sleep(restartDelay)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func goRecoverOnce(name string, fn func()) {
|
||||||
|
go func() {
|
||||||
|
_ = runRecoverable(name, fn)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func runRecoverable(name string, fn func()) (panicked bool) {
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
panicked = true
|
||||||
|
slog.Error("recovered panic",
|
||||||
|
"component", name,
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
fn()
|
||||||
|
return false
|
||||||
|
}
|
||||||
@@ -4,10 +4,12 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"runtime/debug"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -377,7 +379,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
|||||||
if !q.started {
|
if !q.started {
|
||||||
q.loadLocked()
|
q.loadLocked()
|
||||||
q.started = true
|
q.started = true
|
||||||
go q.worker()
|
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
||||||
}
|
}
|
||||||
hasPending := q.nextPending() != nil
|
hasPending := q.nextPending() != nil
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
@@ -392,75 +394,90 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
|||||||
func (q *taskQueue) worker() {
|
func (q *taskQueue) worker() {
|
||||||
for {
|
for {
|
||||||
<-q.trigger
|
<-q.trigger
|
||||||
setCPUGovernor("performance")
|
func() {
|
||||||
|
setCPUGovernor("performance")
|
||||||
|
defer setCPUGovernor("powersave")
|
||||||
|
|
||||||
// Drain all pending tasks and start them in parallel.
|
// Drain all pending tasks and start them in parallel.
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
var batch []*Task
|
var batch []*Task
|
||||||
for {
|
for {
|
||||||
t := q.nextPending()
|
t := q.nextPending()
|
||||||
if t == nil {
|
if t == nil {
|
||||||
break
|
break
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskRunning
|
||||||
|
t.StartedAt = &now
|
||||||
|
t.DoneAt = nil
|
||||||
|
t.ErrMsg = ""
|
||||||
|
j := newTaskJobState(t.LogPath)
|
||||||
|
t.job = j
|
||||||
|
batch = append(batch, t)
|
||||||
}
|
}
|
||||||
now := time.Now()
|
if len(batch) > 0 {
|
||||||
t.Status = TaskRunning
|
q.persistLocked()
|
||||||
t.StartedAt = &now
|
}
|
||||||
t.DoneAt = nil
|
q.mu.Unlock()
|
||||||
t.ErrMsg = ""
|
|
||||||
j := newTaskJobState(t.LogPath)
|
|
||||||
t.job = j
|
|
||||||
batch = append(batch, t)
|
|
||||||
}
|
|
||||||
if len(batch) > 0 {
|
|
||||||
q.persistLocked()
|
|
||||||
}
|
|
||||||
q.mu.Unlock()
|
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
for _, t := range batch {
|
for _, t := range batch {
|
||||||
t := t
|
t := t
|
||||||
j := t.job
|
j := t.job
|
||||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||||
j.cancel = taskCancel
|
j.cancel = taskCancel
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
goRecoverOnce("task "+t.Target, func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
msg := fmt.Sprintf("task panic: %v", rec)
|
||||||
|
slog.Error("task panic",
|
||||||
|
"task_id", t.ID,
|
||||||
|
"target", t.Target,
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
j.append("ERROR: " + msg)
|
||||||
|
j.finish(msg)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||||
}
|
|
||||||
|
|
||||||
q.runTask(t, j, taskCtx)
|
|
||||||
|
|
||||||
if q.kmsgWatcher != nil {
|
|
||||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
|
||||||
}
|
|
||||||
|
|
||||||
q.mu.Lock()
|
|
||||||
now2 := time.Now()
|
|
||||||
t.DoneAt = &now2
|
|
||||||
if t.Status == TaskRunning {
|
|
||||||
if j.err != "" {
|
|
||||||
t.Status = TaskFailed
|
|
||||||
t.ErrMsg = j.err
|
|
||||||
} else {
|
|
||||||
t.Status = TaskDone
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
q.runTask(t, j, taskCtx)
|
||||||
|
|
||||||
|
if q.kmsgWatcher != nil {
|
||||||
|
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
q.mu.Lock()
|
||||||
|
now2 := time.Now()
|
||||||
|
t.DoneAt = &now2
|
||||||
|
if t.Status == TaskRunning {
|
||||||
|
if j.err != "" {
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = j.err
|
||||||
|
} else {
|
||||||
|
t.Status = TaskDone
|
||||||
|
}
|
||||||
|
}
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if len(batch) > 0 {
|
||||||
|
q.mu.Lock()
|
||||||
|
q.prune()
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
}()
|
}
|
||||||
}
|
}()
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
if len(batch) > 0 {
|
|
||||||
q.mu.Lock()
|
|
||||||
q.prune()
|
|
||||||
q.persistLocked()
|
|
||||||
q.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
setCPUGovernor("powersave")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ systemctl enable bee-preflight.service
|
|||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
systemctl enable bee-sshsetup.service
|
systemctl enable bee-sshsetup.service
|
||||||
|
systemctl enable bee-selfheal.timer
|
||||||
systemctl enable ssh.service
|
systemctl enable ssh.service
|
||||||
systemctl enable lightdm.service 2>/dev/null || true
|
systemctl enable lightdm.service 2>/dev/null || true
|
||||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||||
@@ -58,6 +59,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
@@ -171,6 +171,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
|
||||||
|
ok "timer active: bee-selfheal.timer"
|
||||||
|
else
|
||||||
|
fail "timer NOT active: bee-selfheal.timer"
|
||||||
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "-- runtime health --"
|
echo "-- runtime health --"
|
||||||
if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
|
if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
|
||||||
|
|||||||
9
iso/overlay/etc/systemd/system/bee-selfheal.service
Normal file
9
iso/overlay/etc/systemd/system/bee-selfheal.service
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Bee: periodic runtime self-heal
|
||||||
|
After=bee-web.service bee-audit.service bee-preflight.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
11
iso/overlay/etc/systemd/system/bee-selfheal.timer
Normal file
11
iso/overlay/etc/systemd/system/bee-selfheal.timer
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Bee: run self-heal checks periodically
|
||||||
|
|
||||||
|
[Timer]
|
||||||
|
OnBootSec=45sec
|
||||||
|
OnUnitActiveSec=60sec
|
||||||
|
AccuracySec=15sec
|
||||||
|
Unit=bee-selfheal.service
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=timers.target
|
||||||
@@ -1,11 +1,12 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=2
|
RestartSec=3
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
LimitMEMLOCK=infinity
|
LimitMEMLOCK=infinity
|
||||||
|
|||||||
99
iso/overlay/usr/local/bin/bee-selfheal
Normal file
99
iso/overlay/usr/local/bin/bee-selfheal
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# bee-selfheal — periodic best-effort recovery for critical live ISO services.
|
||||||
|
|
||||||
|
set -u
|
||||||
|
|
||||||
|
LOG_PREFIX="bee-selfheal"
|
||||||
|
EXPORT_DIR="/appdata/bee/export"
|
||||||
|
AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
|
||||||
|
RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
|
||||||
|
LOCK_DIR="/run/bee-selfheal.lock"
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo "[${LOG_PREFIX}] $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
have_nvidia_gpu() {
|
||||||
|
lspci -nn 2>/dev/null | grep -qi '10de:'
|
||||||
|
}
|
||||||
|
|
||||||
|
service_active() {
|
||||||
|
systemctl is-active --quiet "$1" 2>/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
restart_service() {
|
||||||
|
local svc="$1"
|
||||||
|
if systemctl restart "$svc" >/dev/null 2>&1; then
|
||||||
|
log "restarted ${svc}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
log "WARN: failed to restart ${svc}"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
file_ready() {
|
||||||
|
[ -s "$1" ]
|
||||||
|
}
|
||||||
|
|
||||||
|
artifact_state() {
|
||||||
|
local path="$1"
|
||||||
|
if [ -s "${path}" ]; then
|
||||||
|
echo "ready"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
if [ -e "${path}.tmp" ]; then
|
||||||
|
echo "interrupted"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
echo "missing"
|
||||||
|
}
|
||||||
|
|
||||||
|
web_healthy() {
|
||||||
|
bash -c 'exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3' \
|
||||||
|
>/dev/null 2>&1
|
||||||
|
}
|
||||||
|
|
||||||
|
mkdir -p "${EXPORT_DIR}" /run
|
||||||
|
|
||||||
|
if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
|
||||||
|
log "another self-heal run is already active"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
|
||||||
|
|
||||||
|
log "start"
|
||||||
|
|
||||||
|
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||||
|
log "NVIDIA GPU detected but /dev/nvidia0 is missing"
|
||||||
|
restart_service bee-nvidia.service || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
runtime_state="$(artifact_state "${RUNTIME_JSON}")"
|
||||||
|
if [ "${runtime_state}" != "ready" ]; then
|
||||||
|
if [ "${runtime_state}" = "interrupted" ]; then
|
||||||
|
log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
|
||||||
|
else
|
||||||
|
log "runtime-health.json missing or empty"
|
||||||
|
fi
|
||||||
|
restart_service bee-preflight.service || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
audit_state="$(artifact_state "${AUDIT_JSON}")"
|
||||||
|
if [ "${audit_state}" != "ready" ]; then
|
||||||
|
if [ "${audit_state}" = "interrupted" ]; then
|
||||||
|
log "bee-audit.json.tmp exists — interrupted audit write detected"
|
||||||
|
else
|
||||||
|
log "bee-audit.json missing or empty"
|
||||||
|
fi
|
||||||
|
restart_service bee-audit.service || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! service_active bee-web.service; then
|
||||||
|
log "bee-web.service is not active"
|
||||||
|
restart_service bee-web.service || true
|
||||||
|
elif ! web_healthy; then
|
||||||
|
log "bee-web health check failed"
|
||||||
|
restart_service bee-web.service || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "done"
|
||||||
Reference in New Issue
Block a user