Add stability hardening and self-heal recovery
This commit is contained in:
@@ -195,13 +195,11 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return "", err
|
||||
err := atomicWriteFile(path, append(data, '\n'), 0644)
|
||||
if err == nil {
|
||||
return path, nil
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
return "", err
|
||||
default:
|
||||
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
||||
}
|
||||
@@ -223,13 +221,11 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return "", err
|
||||
err := atomicWriteFile(path, append(data, '\n'), 0644)
|
||||
if err == nil {
|
||||
return path, nil
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
return "", err
|
||||
default:
|
||||
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
||||
}
|
||||
|
||||
48
audit/internal/app/atomic_write.go
Normal file
48
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,48 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||
}
|
||||
|
||||
tmpPath := path + ".tmp"
|
||||
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open temp %s: %w", tmpPath, err)
|
||||
}
|
||||
|
||||
success := false
|
||||
defer func() {
|
||||
_ = f.Close()
|
||||
if !success {
|
||||
_ = os.Remove(tmpPath)
|
||||
}
|
||||
}()
|
||||
|
||||
if _, err := f.Write(data); err != nil {
|
||||
return fmt.Errorf("write temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return fmt.Errorf("close temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := os.Rename(tmpPath, path); err != nil {
|
||||
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
|
||||
}
|
||||
|
||||
if dir, err := os.Open(filepath.Dir(path)); err == nil {
|
||||
_ = dir.Sync()
|
||||
_ = dir.Close()
|
||||
}
|
||||
|
||||
success = true
|
||||
return nil
|
||||
}
|
||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||
t.Fatalf("seed file: %v", err)
|
||||
}
|
||||
|
||||
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||
t.Fatalf("atomicWriteFile: %v", err)
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read final: %v", err)
|
||||
}
|
||||
if string(raw) != "new\n" {
|
||||
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||
a := &App{
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||
return schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
ExportDir: exportDir,
|
||||
DriverReady: true,
|
||||
CUDAReady: true,
|
||||
}, nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got, err := a.RunRuntimePreflight("file:" + path)
|
||||
if err != nil {
|
||||
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||
}
|
||||
if got != path {
|
||||
t.Fatalf("path=%q want %q", got, path)
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read runtime file: %v", err)
|
||||
}
|
||||
var health schema.RuntimeHealth
|
||||
if err := json.Unmarshal(raw, &health); err != nil {
|
||||
t.Fatalf("json unmarshal: %v", err)
|
||||
}
|
||||
if health.Status != "OK" {
|
||||
t.Fatalf("status=%q want OK", health.Status)
|
||||
}
|
||||
}
|
||||
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
|
||||
"bee-network.service",
|
||||
"bee-nvidia.service",
|
||||
"bee-preflight.service",
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user