Add stability hardening and self-heal recovery

This commit is contained in:
2026-04-05 10:29:37 +03:00
parent 9826d437a5
commit 143b7dca5d
18 changed files with 495 additions and 111 deletions

View File

@@ -195,13 +195,11 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
return "stdout", err
case strings.HasPrefix(output, "file:"):
path := strings.TrimPrefix(output, "file:")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return "", err
err := atomicWriteFile(path, append(data, '\n'), 0644)
if err == nil {
return path, nil
}
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err
}
return path, nil
return "", err
default:
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
}
@@ -223,13 +221,11 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
return "stdout", err
case strings.HasPrefix(output, "file:"):
path := strings.TrimPrefix(output, "file:")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return "", err
err := atomicWriteFile(path, append(data, '\n'), 0644)
if err == nil {
return path, nil
}
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err
}
return path, nil
return "", err
default:
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
}

View File

@@ -0,0 +1,48 @@
package app
import (
"fmt"
"os"
"path/filepath"
)
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
}
tmpPath := path + ".tmp"
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
if err != nil {
return fmt.Errorf("open temp %s: %w", tmpPath, err)
}
success := false
defer func() {
_ = f.Close()
if !success {
_ = os.Remove(tmpPath)
}
}()
if _, err := f.Write(data); err != nil {
return fmt.Errorf("write temp %s: %w", tmpPath, err)
}
if err := f.Sync(); err != nil {
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
}
if err := f.Close(); err != nil {
return fmt.Errorf("close temp %s: %w", tmpPath, err)
}
if err := os.Rename(tmpPath, path); err != nil {
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
}
if dir, err := os.Open(filepath.Dir(path)); err == nil {
_ = dir.Sync()
_ = dir.Close()
}
success = true
return nil
}

View File

@@ -0,0 +1,71 @@
package app
import (
"encoding/json"
"os"
"path/filepath"
"testing"
"bee/audit/internal/schema"
)
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
path := filepath.Join(t.TempDir(), "bee-audit.json")
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
t.Fatalf("seed file: %v", err)
}
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
t.Fatalf("atomicWriteFile: %v", err)
}
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read final: %v", err)
}
if string(raw) != "new\n" {
t.Fatalf("final content=%q want %q", string(raw), "new\n")
}
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
t.Fatalf("tmp file should be absent after success, err=%v", err)
}
}
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
path := filepath.Join(t.TempDir(), "runtime-health.json")
a := &App{
runtime: fakeRuntime{
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
return schema.RuntimeHealth{
Status: "OK",
ExportDir: exportDir,
DriverReady: true,
CUDAReady: true,
}, nil
},
},
}
got, err := a.RunRuntimePreflight("file:" + path)
if err != nil {
t.Fatalf("RunRuntimePreflight: %v", err)
}
if got != path {
t.Fatalf("path=%q want %q", got, path)
}
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
t.Fatalf("tmp file should be absent after success, err=%v", err)
}
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read runtime file: %v", err)
}
var health schema.RuntimeHealth
if err := json.Unmarshal(raw, &health); err != nil {
t.Fatalf("json unmarshal: %v", err)
}
if health.Status != "OK" {
t.Fatalf("status=%q want OK", health.Status)
}
}

View File

@@ -19,6 +19,8 @@ var supportBundleServices = []string{
"bee-network.service",
"bee-nvidia.service",
"bee-preflight.service",
"bee-selfheal.service",
"bee-selfheal.timer",
"bee-sshsetup.service",
}