Fix webui streaming recovery regressions

Add NVIDIA benchmark reporting flow
Add stability hardening and self-heal recovery
2026-04-05 10:39:09 +03:00 · 2026-04-05 10:30:56 +03:00 · 2026-04-05 10:29:37 +03:00 · 2026-04-05 09:57:38 +03:00 · 2026-04-04 15:23:15 +03:00 · 2026-04-04 15:18:43 +03:00
82 changed files with 6767 additions and 944 deletions
--- a/audit/Makefile
+++ b/audit/Makefile
@@ -1,5 +1,7 @@
 LISTEN ?= :8080
 AUDIT_PATH ?=
 VERSION ?= $(shell sh ./scripts/resolve-version.sh)
 GO_LDFLAGS := -X main.Version=$(VERSION)
 RUN_ARGS := web --listen $(LISTEN)
 ifneq ($(AUDIT_PATH),)
@@ -9,10 +11,10 @@ endif
 .PHONY: run build test
 run:
-	go run ./cmd/bee $(RUN_ARGS)
+	go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
 build:
-	go build -o bee ./cmd/bee
+	go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
 test:
 	go test ./...
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -8,6 +8,7 @@ import (
 	"log/slog"
 	"os"
 	"runtime/debug"
 	"strconv"
 	"strings"
 	"bee/audit/internal/app"
@@ -21,30 +22,7 @@ var Version = "dev"
 func buildLabel() string {
 	label := strings.TrimSpace(Version)
 	if label == "" {
-		label = "dev"
+		return "dev"
 	}
 	if info, ok := debug.ReadBuildInfo(); ok {
 		var revision string
 		var modified bool
 		for _, setting := range info.Settings {
 			switch setting.Key {
 			case "vcs.revision":
 				revision = setting.Value
 			case "vcs.modified":
 				modified = setting.Value == "true"
 			}
 		}
 		if revision != "" {
 			short := revision
 			if len(short) > 12 {
 				short = short[:12]
 			}
 			label += " (" + short
 			if modified {
 				label += "+"
 			}
 			label += ")"
 		}
 	}
 	return label
 }
@@ -53,10 +31,19 @@ func main() {
 	os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
 }
-func run(args []string, stdout, stderr io.Writer) int {
+func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level: slog.LevelInfo,
 	})))
 	defer func() {
 		if rec := recover(); rec != nil {
 			slog.Error("fatal panic",
 				"panic", fmt.Sprint(rec),
 				"stack", string(debug.Stack()),
 			)
 			exitCode = 1
 		}
 	}()
 	if len(args) == 0 {
 		printRootUsage(stderr)
@@ -82,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
 		return runWeb(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
 	case "benchmark":
 		return runBenchmark(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -100,6 +89,7 @@ func printRootUsage(w io.Writer) {
  bee support-bundle --output stdout|file:<path>
  bee web     --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
  bee benchmark nvidia [--profile standard|stability|overnight]
  bee version
  bee help [command]`)
 }
@@ -118,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runWeb([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
 	case "benchmark":
 		return runBenchmark([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -407,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	slog.Info("sat archive written", "target", target, "path", archive)
 	return 0
 }
 // runBenchmark implements the "bee benchmark" subcommand.
 //
 // args are the arguments after "benchmark"; the first must be the target
 // (only "nvidia" is accepted) or a help flag. Usage and diagnostics are written
 // to stderr (help output to stdout). Benchmark progress lines are written to
 // the injected stderr writer so callers that capture output see them.
 //
 // Exit codes: 0 on success or help, 1 when the benchmark run fails, 2 on
 // usage or flag-parsing errors.
 func runBenchmark(args []string, stdout, stderr io.Writer) int {
 	const usage = "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]"
 	if len(args) == 0 {
 		fmt.Fprintln(stderr, usage)
 		return 2
 	}
 	switch args[0] {
 	case "help", "--help", "-h":
 		fmt.Fprintln(stdout, usage)
 		return 0
 	}
 	target := args[0]
 	if target != "nvidia" {
 		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
 		fmt.Fprintln(stderr, usage)
 		return 2
 	}
 	fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
 	fs.SetOutput(stderr)
 	profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
 	devices := fs.String("devices", "", "comma-separated GPU indices to include")
 	exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
 	sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
 	skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
 	if err := fs.Parse(args[1:]); err != nil {
 		// -h/--help after the target is a successful help request, not a usage error.
 		if err == flag.ErrHelp {
 			return 0
 		}
 		return 2
 	}
 	if fs.NArg() != 0 {
 		fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
 		return 2
 	}
 	includeIndices, err := parseBenchmarkIndexCSV(*devices)
 	if err != nil {
 		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
 		return 2
 	}
 	excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
 	if err != nil {
 		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
 		return 2
 	}
 	application := app.New(platform.New())
 	// Progress goes to the injected writer rather than the process-global
 	// os.Stderr, so tests and embedding callers can capture it.
 	logLine := func(s string) { fmt.Fprintln(stderr, s) }
 	// An empty base directory lets the app layer pick its default location.
 	archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
 		Profile:           *profile,
 		SizeMB:            *sizeMB,
 		GPUIndices:        includeIndices,
 		ExcludeGPUIndices: excludeIndices,
 		RunNCCL:           !*skipNCCL,
 	}, logLine)
 	if err != nil {
 		slog.Error("run benchmark", "target", target, "err", err)
 		return 1
 	}
 	slog.Info("benchmark archive written", "target", target, "path", archive)
 	return 0
 }
 // parseBenchmarkIndexCSV converts a comma-separated list of GPU indices into
 // a slice of ints. Surrounding whitespace is ignored and empty items are
 // skipped. A blank input yields (nil, nil). Any item that is not a
 // non-negative integer produces an error naming the offending item.
 func parseBenchmarkIndexCSV(raw string) ([]int, error) {
 	trimmed := strings.TrimSpace(raw)
 	if len(trimmed) == 0 {
 		return nil, nil
 	}
 	fields := strings.Split(trimmed, ",")
 	var parsed []int
 	for i := 0; i < len(fields); i++ {
 		token := strings.TrimSpace(fields[i])
 		if token == "" {
 			// Tolerate stray commas such as "0,,1" or a trailing comma.
 			continue
 		}
 		n, convErr := strconv.Atoi(token)
 		switch {
 		case convErr != nil, n < 0:
 			return nil, fmt.Errorf("bad gpu index %q", token)
 		}
 		parsed = append(parsed, n)
 	}
 	return parsed, nil
 }
--- a/audit/cmd/bee/main_test.go
+++ b/audit/cmd/bee/main_test.go
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
 }
 func TestRunVersion(t *testing.T) {
 	t.Parallel()
 	old := Version
 	Version = "test-version"
 	t.Cleanup(func() { Version = old })
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
 	}
 }
 // TestBuildLabelUsesVersionAsIs verifies that buildLabel returns a non-empty
 // Version string unchanged. It temporarily overrides the package-level
 // Version variable and restores the previous value on cleanup.
 func TestBuildLabelUsesVersionAsIs(t *testing.T) {
 	old := Version
 	Version = "1.2.3"
 	t.Cleanup(func() { Version = old })
 	if got := buildLabel(); got != "1.2.3" {
 		t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
 	}
 }
 func TestRunExportRequiresTarget(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -19,17 +19,18 @@ import (
 )
 var (
-	DefaultExportDir       = "/appdata/bee/export"
+	DefaultExportDir        = "/appdata/bee/export"
-	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath    = DefaultExportDir + "/bee-audit.log"
+	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath      = DefaultExportDir + "/bee-web.log"
+	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath  = DefaultExportDir + "/bee-network.log"
+	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath   = DefaultExportDir + "/bee-nvidia.log"
+	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath      = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath  = DefaultExportDir + "/runtime-health.log"
+	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir     = DefaultExportDir + "/techdump"
+	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir      = DefaultExportDir + "/bee-sat"
+	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
 	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
 )
 type App struct {
@@ -40,6 +41,8 @@ type App struct {
 	sat       satRunner
 	runtime   runtimeChecker
 	installer installer
 	// StatusDB is the unified component health store (nil if unavailable).
 	StatusDB *ComponentStatusDB
 }
 type ActionResult struct {
@@ -80,6 +83,7 @@ type installer interface {
 	ListInstallDisks() ([]platform.InstallDisk, error)
 	InstallToDisk(ctx context.Context, device string, logFile string) error
 	IsLiveMediaInRAM() bool
 	LiveBootSource() platform.LiveBootSource
 	RunInstallToRAM(ctx context.Context, logFunc func(string)) error
 }
@@ -100,6 +104,10 @@ func (a *App) IsLiveMediaInRAM() bool {
 	return a.installer.IsLiveMediaInRAM()
 }
 // LiveBootSource returns the live boot source reported by the configured
 // installer backend.
 func (a *App) LiveBootSource() platform.LiveBootSource {
 	return a.installer.LiveBootSource()
 }
 func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 	return a.installer.RunInstallToRAM(ctx, logFunc)
 }
@@ -107,6 +115,7 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 type satRunner interface {
 	RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
 	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -131,7 +140,7 @@ type runtimeChecker interface {
 }
 func New(platform *platform.System) *App {
-	return &App{
+	a := &App{
 		network:   platform,
 		services:  platform,
 		exports:   platform,
@@ -140,19 +149,32 @@ func New(platform *platform.System) *App {
 		runtime:   platform,
 		installer: platform,
 	}
 	if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
 		a.StatusDB = db
 	}
 	return a
 }
 // ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
 // and returns the updated JSON. Used by the web UI to serve always-fresh status.
 func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
-	var snap schema.HardwareIngestRequest
+	snap, err := readAuditSnapshot(auditJSON)
-	if err := json.Unmarshal(auditJSON, &snap); err != nil {
+	if err != nil {
 		return nil, err
 	}
-	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
 	return json.MarshalIndent(snap, "", "  ")
 }
 // readAuditSnapshot decodes raw audit JSON into a HardwareIngestRequest and
 // passes its hardware section through collector.NormalizeSnapshot before
 // returning it. A decode failure returns a zero-value request and the error.
 func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
 	decoded := schema.HardwareIngestRequest{}
 	decodeErr := json.Unmarshal(auditJSON, &decoded)
 	if decodeErr != nil {
 		// Never hand back a partially decoded snapshot.
 		return schema.HardwareIngestRequest{}, decodeErr
 	}
 	collector.NormalizeSnapshot(&decoded.Hardware, decoded.CollectedAt)
 	return decoded, nil
 }
 func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
 	if runtimeMode == runtimeenv.ModeLiveCD {
 		if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
@@ -160,7 +182,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		}
 	}
 	result := collector.Run(runtimeMode)
-	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
@@ -175,10 +197,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -203,10 +222,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -276,6 +292,9 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
 	if err != nil {
 		return "", err
 	}
 	if normalized, normErr := ApplySATOverlay(data); normErr == nil {
 		data = normalized
 	}
 	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
 		return "", err
 	}
@@ -513,6 +532,17 @@ func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOpti
 	return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
 }
 // RunNvidiaBenchmark runs the NVIDIA benchmark with a background
 // (non-cancellable) context by delegating to RunNvidiaBenchmarkCtx.
 func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
 }
 // RunNvidiaBenchmarkCtx forwards an NVIDIA benchmark request to the SAT
 // runner. A blank baseDir (empty or whitespace only) is replaced with
 // DefaultBenchmarkBaseDir. The runner's result and error are returned as-is.
 func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	outputDir := baseDir
 	if len(strings.TrimSpace(outputDir)) == 0 {
 		outputDir = DefaultBenchmarkBaseDir
 	}
 	return a.sat.RunNvidiaBenchmark(ctx, outputDir, opts, logFunc)
 }
 func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -733,6 +763,7 @@ func (a *App) HealthSummaryResult() ActionResult {
 	if err := json.Unmarshal(raw, &snapshot); err != nil {
 		return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
 	}
 	collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
 	summary := collector.BuildHealthSummary(snapshot.Hardware)
 	var body strings.Builder
@@ -767,6 +798,7 @@ func (a *App) MainBanner() string {
 	if err := json.Unmarshal(raw, &snapshot); err != nil {
 		return ""
 	}
 	collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
 	var lines []string
 	if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -120,15 +120,16 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 }
 type fakeSAT struct {
-	runNvidiaFn       func(string) (string, error)
+	runNvidiaFn          func(string) (string, error)
-	runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
+	runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
-	runMemoryFn       func(string) (string, error)
+	runNvidiaStressFn    func(string, platform.NvidiaStressOptions) (string, error)
-	runStorageFn      func(string) (string, error)
+	runMemoryFn          func(string) (string, error)
-	runCPUFn          func(string, int) (string, error)
+	runStorageFn         func(string) (string, error)
-	detectVendorFn    func() string
+	runCPUFn             func(string, int) (string, error)
-	listAMDGPUsFn     func() ([]platform.AMDGPUInfo, error)
+	detectVendorFn       func() string
-	runAMDPackFn      func(string) (string, error)
+	listAMDGPUsFn        func() ([]platform.AMDGPUInfo, error)
-	listNvidiaGPUsFn  func() ([]platform.NvidiaGPU, error)
+	runAMDPackFn         func(string) (string, error)
 	listNvidiaGPUsFn     func() ([]platform.NvidiaGPU, error)
 }
 func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -139,6 +140,13 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaBenchmark is the fake benchmark entry point. It calls
 // runNvidiaBenchmarkFn when the test supplied one, and otherwise falls back
 // to runNvidiaFn. The context and log callback are ignored.
 func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
 	if f.runNvidiaBenchmarkFn == nil {
 		return f.runNvidiaFn(baseDir)
 	}
 	return f.runNvidiaBenchmarkFn(baseDir, opts)
 }
 func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
 	if f.runNvidiaStressFn != nil {
 		return f.runNvidiaStressFn(baseDir, opts)
@@ -660,13 +668,50 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
 	}
 }
 // TestApplySATOverlayFiltersIgnoredLegacyDevices feeds ApplySATOverlay an
 // audit snapshot containing a "Virtual HDisk0" storage entry and a
 // "Co-processor" PCIe device next to real hardware, and asserts that the
 // returned JSON drops the former two while keeping the real devices.
 func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
 	tmp := t.TempDir()
 	// Point the SAT base dir at an empty temp location so the overlay does
 	// not read results from the real export directory; restored on cleanup.
 	oldSATBaseDir := DefaultSATBaseDir
 	DefaultSATBaseDir = filepath.Join(tmp, "sat")
 	t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
 	raw := `{
 	  "collected_at": "2026-03-15T10:00:00Z",
 	  "hardware": {
 	    "board": {"serial_number": "SRV123"},
 	    "storage": [
 	      {"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
 	      {"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
 	    ],
 	    "pcie_devices": [
 	      {"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
 	      {"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
 	    ]
 	  }
 	}`
 	got, err := ApplySATOverlay([]byte(raw))
 	if err != nil {
 		t.Fatalf("ApplySATOverlay error: %v", err)
 	}
 	text := string(got)
 	// Devices expected to be filtered out must not appear in the output.
 	if contains(text, "Virtual HDisk0") {
 		t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
 	}
 	if contains(text, "\"device_class\": \"Co-processor\"") {
 		t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
 	}
 	// Real devices must survive the filtering.
 	if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
 		t.Fatalf("overlaid audit should keep real devices:\n%s", text)
 	}
 }
 func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	tmp := t.TempDir()
 	exportDir := filepath.Join(tmp, "export")
 	if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
 		t.Fatal(err)
 	}
-	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
+	if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
 		t.Fatal(err)
 	}
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
@@ -698,6 +743,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	tr := tar.NewReader(gzr)
 	var names []string
 	var auditJSON string
 	for {
 		hdr, err := tr.Next()
 		if errors.Is(err, io.EOF) {
@@ -707,6 +753,33 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 			t.Fatalf("read tar entry: %v", err)
 		}
 		names = append(names, hdr.Name)
 		if contains(hdr.Name, "/export/bee-audit.json") {
 			body, err := io.ReadAll(tr)
 			if err != nil {
 				t.Fatalf("read audit entry: %v", err)
 			}
 			auditJSON = string(body)
 		}
 	}
 	for _, want := range []string{
 		"/system/ip-link.txt",
 		"/system/ip-link-stats.txt",
 		"/system/ethtool-info.txt",
 		"/system/ethtool-link.txt",
 		"/system/ethtool-module.txt",
 		"/system/mstflint-query.txt",
 	} {
 		var found bool
 		for _, name := range names {
 			if contains(name, want) {
 				found = true
 				break
 			}
 		}
 		if !found {
 			t.Fatalf("support bundle missing %s, names=%v", want, names)
 		}
 	}
 	var foundRaw bool
@@ -721,6 +794,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if !foundRaw {
 		t.Fatalf("support bundle missing raw SAT log, names=%v", names)
 	}
 	if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
 		t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
 	}
 	if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
 		t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
 	}
 }
 func TestMainBanner(t *testing.T) {
@@ -734,6 +813,10 @@ func TestMainBanner(t *testing.T) {
 	product := "PowerEdge R760"
 	cpuModel := "Intel Xeon Gold 6430"
 	memoryType := "DDR5"
 	memorySerialA := "DIMM-A"
 	memorySerialB := "DIMM-B"
 	storageSerialA := "DISK-A"
 	storageSerialB := "DISK-B"
 	gpuClass := "VideoController"
 	gpuModel := "NVIDIA H100"
@@ -749,12 +832,12 @@ func TestMainBanner(t *testing.T) {
 				{Model: &cpuModel},
 			},
 			Memory: []schema.HardwareMemory{
-				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
+				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
-				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
+				{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
 			},
 			Storage: []schema.HardwareStorage{
-				{Present: &trueValue, SizeGB: intPtr(3840)},
+				{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
-				{Present: &trueValue, SizeGB: intPtr(3840)},
+				{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
 			},
 			PCIeDevices: []schema.HardwarePCIeDevice{
 				{DeviceClass: &gpuClass, Model: &gpuModel},
--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -0,0 +1,48 @@
 package app
 import (
 	"fmt"
 	"os"
 	"path/filepath"
 )
 // atomicWriteFile replaces the file at path with data by writing to a
 // sibling "<path>.tmp" file, syncing it, and renaming it over the target.
 //
 // The parent directory is created (mode 0755) if missing. perm is the mode
 // passed when creating the temporary file. On any failure the temporary file
 // is removed and a wrapped error describing the failed step is returned.
 // After a successful rename the parent directory is synced on a best-effort
 // basis; errors from that step are ignored.
 func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
 	parent := filepath.Dir(path)
 	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
 		return fmt.Errorf("mkdir %s: %w", parent, mkErr)
 	}
 	staging := path + ".tmp"
 	out, openErr := os.OpenFile(staging, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
 	if openErr != nil {
 		return fmt.Errorf("open temp %s: %w", staging, openErr)
 	}
 	committed := false
 	defer func() {
 		// Close is repeated here for the early-return paths; after the
 		// explicit Close below this is a harmless no-op error.
 		_ = out.Close()
 		if committed {
 			return
 		}
 		_ = os.Remove(staging)
 	}()
 	_, writeErr := out.Write(data)
 	if writeErr != nil {
 		return fmt.Errorf("write temp %s: %w", staging, writeErr)
 	}
 	syncErr := out.Sync()
 	if syncErr != nil {
 		return fmt.Errorf("sync temp %s: %w", staging, syncErr)
 	}
 	closeErr := out.Close()
 	if closeErr != nil {
 		return fmt.Errorf("close temp %s: %w", staging, closeErr)
 	}
 	renameErr := os.Rename(staging, path)
 	if renameErr != nil {
 		return fmt.Errorf("rename %s -> %s: %w", staging, path, renameErr)
 	}
 	// Best effort: flush the directory entry so the rename itself is durable.
 	if dirHandle, dirErr := os.Open(parent); dirErr == nil {
 		_ = dirHandle.Sync()
 		_ = dirHandle.Close()
 	}
 	committed = true
 	return nil
 }
--- a/audit/internal/app/atomic_write_test.go
+++ b/audit/internal/app/atomic_write_test.go
@@ -0,0 +1,71 @@
 package app
 import (
 	"encoding/json"
 	"os"
 	"path/filepath"
 	"testing"
 	"bee/audit/internal/schema"
 )
 // TestAtomicWriteFileReplacesTargetWithoutLeavingTmp seeds an existing file,
 // overwrites it through atomicWriteFile, and checks that the new content is
 // in place and that no "<path>.tmp" file remains afterwards.
 func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
 	path := filepath.Join(t.TempDir(), "bee-audit.json")
 	if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
 		t.Fatalf("seed file: %v", err)
 	}
 	if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
 		t.Fatalf("atomicWriteFile: %v", err)
 	}
 	raw, err := os.ReadFile(path)
 	if err != nil {
 		t.Fatalf("read final: %v", err)
 	}
 	if string(raw) != "new\n" {
 		t.Fatalf("final content=%q want %q", string(raw), "new\n")
 	}
 	// The temporary sibling must have been renamed away, not left behind.
 	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
 		t.Fatalf("tmp file should be absent after success, err=%v", err)
 	}
 }
 // TestRunRuntimePreflightWritesAtomically runs RunRuntimePreflight with a
 // "file:" output target against a fake runtime that reports an OK health
 // record. It checks the returned path, that no "<path>.tmp" file is left
 // behind, and that the written file decodes to a RuntimeHealth with status OK.
 func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
 	path := filepath.Join(t.TempDir(), "runtime-health.json")
 	a := &App{
 		runtime: fakeRuntime{
 			collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
 				return schema.RuntimeHealth{
 					Status:      "OK",
 					ExportDir:   exportDir,
 					DriverReady: true,
 					CUDAReady:   true,
 				}, nil
 			},
 		},
 	}
 	got, err := a.RunRuntimePreflight("file:" + path)
 	if err != nil {
 		t.Fatalf("RunRuntimePreflight: %v", err)
 	}
 	if got != path {
 		t.Fatalf("path=%q want %q", got, path)
 	}
 	// The temporary sibling must not survive a successful write.
 	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
 		t.Fatalf("tmp file should be absent after success, err=%v", err)
 	}
 	raw, err := os.ReadFile(path)
 	if err != nil {
 		t.Fatalf("read runtime file: %v", err)
 	}
 	var health schema.RuntimeHealth
 	if err := json.Unmarshal(raw, &health); err != nil {
 		t.Fatalf("json unmarshal: %v", err)
 	}
 	if health.Status != "OK" {
 		t.Fatalf("status=%q want OK", health.Status)
 	}
 }
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -0,0 +1,266 @@
 package app
 import (
 	"encoding/json"
 	"os"
 	"path/filepath"
 	"strings"
 	"sync"
 	"time"
 )
 // ComponentStatusDB is a persistent store of hardware component health records,
 // backed by a single JSON file that is rewritten after every recorded observation.
 // Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
 // Each record keeps an append-only history of observations. A record's current
 // status only moves to a strictly higher severity (see Record), so once a
 // component is marked Warning or Critical, later OK entries do not downgrade it.
 // NOTE(review): no reset operation is visible in this file — confirm how a
 // component's status is expected to be cleared.
 //
 // All methods are safe to call on a nil *ComponentStatusDB.
 type ComponentStatusDB struct {
 	path    string // location of the JSON file on disk
 	mu      sync.Mutex // guards records and serializes writes to path
 	records map[string]*ComponentStatusRecord // keyed by ComponentKey
 }
 // ComponentStatusRecord holds the current and historical health of one hardware component.
 // LastCheckedAt is updated on every observation; LastChangedAt only when the
 // stored Status is replaced. ErrorSummary carries the detail text of the
 // observation that last raised the status.
 type ComponentStatusRecord struct {
 	ComponentKey  string                  `json:"component_key"`
 	Status        string                  `json:"status"` // "OK", "Warning", "Critical", "Unknown"
 	LastCheckedAt time.Time               `json:"last_checked_at"`
 	LastChangedAt time.Time               `json:"last_changed_at"`
 	ErrorSummary  string                  `json:"error_summary,omitempty"`
 	History       []ComponentStatusEntry  `json:"history"`
 }
 // ComponentStatusEntry is one observation written to a component's history.
 // Entries are appended in the order observations are recorded.
 type ComponentStatusEntry struct {
 	At     time.Time `json:"at"` // time the observation was recorded
 	Status string    `json:"status"` // status reported by this observation
 	Source string    `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
 	Detail string    `json:"detail,omitempty"` // optional free-form description
 }
 // OpenComponentStatusDB loads the JSON status DB stored at path, creating the
 // parent directory if needed. A missing file yields an empty DB. File content
 // that does not decode as a record list is ignored rather than reported, so
 // the caller also starts from an empty DB in that case. Errors are returned
 // only for directory creation failures and for read errors other than
 // "file does not exist".
 func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
 	if mkErr := os.MkdirAll(filepath.Dir(path), 0755); mkErr != nil {
 		return nil, mkErr
 	}
 	raw, readErr := os.ReadFile(path)
 	switch {
 	case readErr == nil, os.IsNotExist(readErr):
 		// Either we have content or the DB simply has not been written yet.
 	default:
 		return nil, readErr
 	}
 	store := &ComponentStatusDB{
 		path:    path,
 		records: make(map[string]*ComponentStatusRecord),
 	}
 	if len(raw) == 0 {
 		return store, nil
 	}
 	var loaded []ComponentStatusRecord
 	if json.Unmarshal(raw, &loaded) != nil {
 		// Undecodable content is tolerated: start empty instead of failing.
 		return store, nil
 	}
 	for i := range loaded {
 		rec := &loaded[i]
 		store.records[rec.ComponentKey] = rec
 	}
 	return store, nil
 }
 // Record writes one observation for the given component key.
 // source is a short label like "sat:nvidia" or "watchdog:kmsg".
 // status is "OK", "Warning", "Critical", or "Unknown".
 //
 // The observation is always appended to the component's history and
 // LastCheckedAt is always refreshed. The stored Status is replaced only when
 // the new status has strictly higher severity than the current one (as ranked
 // by componentSeverity), or when the record has no status yet. In particular,
 // OK never downgrades an existing Warning or Critical status.
 //
 // Record is a no-op on a nil DB or a blank key. The DB file is rewritten after
 // each call; a failure to save is ignored.
 func (db *ComponentStatusDB) Record(key, source, status, detail string) {
 	if db == nil || strings.TrimSpace(key) == "" {
 		return
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	now := time.Now().UTC()
 	rec, exists := db.records[key]
 	if !exists {
 		// First observation for this component: start an empty record.
 		rec = &ComponentStatusRecord{ComponentKey: key}
 		db.records[key] = rec
 	}
 	rec.LastCheckedAt = now
 	entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
 	rec.History = append(rec.History, entry)
 	// Status merge: OK never downgrades Warning/Critical.
 	newSev := componentSeverity(status)
 	curSev := componentSeverity(rec.Status)
 	if newSev > curSev {
 		rec.Status = status
 		rec.LastChangedAt = now
 		rec.ErrorSummary = detail
 	} else if rec.Status == "" {
 		// A brand-new record whose first status has severity 0 (e.g. "Unknown")
 		// still needs an initial status; ErrorSummary is left untouched here.
 		rec.Status = status
 		rec.LastChangedAt = now
 	}
 	// Best effort persistence: the in-memory state stays authoritative.
 	_ = db.saveLocked()
 }
 // Get looks up the record stored under key and returns a copy of it together
 // with true. It returns a zero record and false when the key is unknown or
 // when db is nil.
 func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
 	var none ComponentStatusRecord
 	if db == nil {
 		return none, false
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	if rec, found := db.records[key]; found {
 		return *rec, true
 	}
 	return none, false
 }
 // All returns copies of every stored record, in unspecified (map iteration)
 // order. It returns nil when db is nil.
 func (db *ComponentStatusDB) All() []ComponentStatusRecord {
 	if db == nil {
 		return nil
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	snapshot := make([]ComponentStatusRecord, len(db.records))
 	next := 0
 	for _, rec := range db.records {
 		snapshot[next] = *rec
 		next++
 	}
 	return snapshot
 }
 // saveLocked serializes all records as indented JSON and replaces the DB file.
 // The caller must hold db.mu.
 //
 // The file is replaced through atomicWriteFile (temp file + rename) instead of
 // being rewritten in place: an interrupted in-place write would leave truncated
 // JSON behind, which OpenComponentStatusDB treats as an empty DB, silently
 // dropping the recorded history.
 func (db *ComponentStatusDB) saveLocked() error {
 	records := make([]ComponentStatusRecord, 0, len(db.records))
 	for _, r := range db.records {
 		records = append(records, *r)
 	}
 	data, err := json.MarshalIndent(records, "", "  ")
 	if err != nil {
 		return err
 	}
 	return atomicWriteFile(db.path, data, 0644)
 }
 // componentSeverity ranks a status string so that statuses can be compared:
 // "Critical" is 3, "Warning" is 2, "OK" is 1, and anything else (including
 // "Unknown" and the empty string) is 0. Surrounding whitespace is ignored;
 // the match is case-sensitive.
 func componentSeverity(status string) int {
 	normalized := strings.TrimSpace(status)
 	if normalized == "Critical" {
 		return 3
 	}
 	if normalized == "Warning" {
 		return 2
 	}
 	if normalized == "OK" {
 		return 1
 	}
 	return 0
 }
 // ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
 // and writes component status records to db for the given SAT target.
 // archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
 //
 // The run directory is derived by stripping the ".tar.gz" suffix from the
 // archive path. The call is a silent no-op when db is nil, archivePath is
 // blank, summary.txt cannot be read, or the summary has no overall_status.
 // Targets not listed in the switch below produce no records.
 func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
 	if db == nil || strings.TrimSpace(archivePath) == "" {
 		return
 	}
 	archivePath = extractArchivePath(archivePath)
 	if archivePath == "" {
 		return
 	}
 	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
 	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
 	if err != nil {
 		return
 	}
 	kv := parseSATKV(string(data))
 	overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
 	if overall == "" {
 		return
 	}
 	source := "sat:" + target
 	dbStatus := satStatusToDBStatus(overall)
 	// Map SAT target to component keys.
 	switch target {
 	case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
 		// GPU targets are recorded under one aggregate key per target.
 		db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
 	case "memory", "memory-stress", "sat-stress":
 		db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
 	case "cpu", "platform-stress":
 		db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
 	case "storage":
 		// Try to record per-device if available in summary.
 		recordedAny := false
 		for key, val := range kv {
 			if !strings.HasSuffix(key, "_status") || key == "overall_status" {
 				continue
 			}
 			// The device name is taken as the text before the first underscore
 			// of the key; keys with no underscore before "_status" are skipped.
 			base := strings.TrimSuffix(key, "_status")
 			idx := strings.Index(base, "_")
 			if idx <= 0 {
 				continue
 			}
 			devName := base[:idx]
 			devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
 			db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
 			recordedAny = true
 		}
 		// No per-device lines: fall back to one aggregate storage record.
 		if !recordedAny {
 			db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
 		}
 	}
 }
 // satStatusToDBStatus translates a SAT summary status into a component DB
 // status: "OK" stays "OK", "FAILED" becomes "Warning", and every other value
 // (including "PARTIAL" and "UNSUPPORTED") becomes "Unknown". The comparison
 // is exact, so callers are expected to pass an upper-cased, trimmed value.
 func satStatusToDBStatus(overall string) string {
 	if overall == "OK" {
 		return "OK"
 	}
 	if overall == "FAILED" {
 		return "Warning"
 	}
 	return "Unknown"
 }
 // ExtractArchivePath extracts a bare .tar.gz path from a string that may be
 // "Archive written to /path/foo.tar.gz" or already a bare path.
 // It is the exported wrapper around extractArchivePath.
 func ExtractArchivePath(s string) string {
 	return extractArchivePath(s)
 }
 // ReadSATOverallStatus returns the upper-cased, trimmed overall_status value
 // from summary.txt in the run directory that corresponds to archivePath (the
 // archive path with its ".tar.gz" suffix removed).
 // It returns "" when archivePath is blank, when the file cannot be read, or
 // when the summary has no overall_status entry.
 func ReadSATOverallStatus(archivePath string) string {
 	if len(strings.TrimSpace(archivePath)) == 0 {
 		return ""
 	}
 	summaryPath := filepath.Join(strings.TrimSuffix(archivePath, ".tar.gz"), "summary.txt")
 	raw, readErr := os.ReadFile(summaryPath)
 	if readErr != nil {
 		return ""
 	}
 	status := parseSATKV(string(raw))["overall_status"]
 	return strings.ToUpper(strings.TrimSpace(status))
 }
 // extractArchivePath trims s and, when the result ends in ".tar.gz", returns
 // its last whitespace-separated word (so a message such as
 // "Archive written to /path/foo.tar.gz" yields "/path/foo.tar.gz").
 // Any other input is returned trimmed but otherwise unchanged.
 func extractArchivePath(s string) string {
 	trimmed := strings.TrimSpace(s)
 	if !strings.HasSuffix(trimmed, ".tar.gz") {
 		return trimmed
 	}
 	words := strings.Fields(trimmed)
 	if len(words) == 0 {
 		return trimmed
 	}
 	return words[len(words)-1]
 }
 // parseSATKV parses newline-separated "key=value" text into a map. Each line
 // is split at its first "="; keys and values are trimmed of surrounding
 // whitespace. Lines without "=" are ignored, and a later duplicate key
 // overwrites an earlier one. The returned map is never nil.
 func parseSATKV(raw string) map[string]string {
 	pairs := map[string]string{}
 	lines := strings.Split(raw, "\n")
 	for i := range lines {
 		entry := strings.TrimSpace(lines[i])
 		sep := strings.Index(entry, "=")
 		if sep < 0 {
 			continue
 		}
 		name := strings.TrimSpace(entry[:sep])
 		pairs[name] = strings.TrimSpace(entry[sep+1:])
 	}
 	return pairs
 }
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -9,7 +9,7 @@ import (
 	"bee/audit/internal/schema"
 )
-func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
+func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
 	if snap == nil || strings.TrimSpace(baseDir) == "" {
 		return
 	}
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
 	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
 		applyStorageSAT(snap.Storage, summary)
 	}
 	// Apply unified component status DB — overlaid last so it can only upgrade severity.
 	applyComponentStatusDB(snap, db)
 }
 type satSummary struct {
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
 	}
 }
 // applyComponentStatusDB merges every record returned by db.All() into snap
 // via mergeComponentStatus. Records whose status is not recognised by
 // dbStatusToSATStatus are skipped. The component key prefix selects the target:
 //   - "pcie:<bdf>" (optionally "pcie:gpu:<bdf>"): PCIe devices whose BDF matches
 //     after normalisation with sanitizeBDFForLookup;
 //   - "storage:all": every storage device; "storage:<dev>": storage devices
 //     whose "linux_device" telemetry value has that base name;
 //   - "memory:": every memory entry;
 //   - "cpu:": every CPU entry.
 // Keys with any other prefix are ignored. A nil snap or db is a no-op.
 func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
 	if snap == nil || db == nil {
 		return
 	}
 	for _, rec := range db.All() {
 		status := dbStatusToSATStatus(rec.Status)
 		if status == "" {
 			continue
 		}
 		var (
 			key       = rec.ComponentKey
 			summary   = rec.ErrorSummary
 			changedAt = rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
 		)
 		switch {
 		case strings.HasPrefix(key, "pcie:"):
 			// Drop the "pcie:" prefix and the optional "gpu:" sub-type.
 			rawBDF := strings.TrimPrefix(strings.TrimPrefix(key, "pcie:"), "gpu:")
 			target := sanitizeBDFForLookup(rawBDF)
 			if target == "" {
 				// Nothing usable to match against; skip this record.
 				continue
 			}
 			for i := range snap.PCIeDevices {
 				dev := &snap.PCIeDevices[i]
 				if dev.BDF != nil && sanitizeBDFForLookup(*dev.BDF) == target {
 					mergeComponentStatus(&dev.HardwareComponentStatus, changedAt, status, summary)
 				}
 			}
 		case strings.HasPrefix(key, "storage:"):
 			wanted := strings.TrimPrefix(key, "storage:")
 			for i := range snap.Storage {
 				disk := &snap.Storage[i]
 				if wanted != "all" {
 					linuxDev, _ := disk.Telemetry["linux_device"].(string)
 					if filepath.Base(strings.TrimSpace(linuxDev)) != wanted {
 						continue
 					}
 				}
 				mergeComponentStatus(&disk.HardwareComponentStatus, changedAt, status, summary)
 			}
 		case strings.HasPrefix(key, "memory:"):
 			for i := range snap.Memory {
 				mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, changedAt, status, summary)
 			}
 		case strings.HasPrefix(key, "cpu:"):
 			for i := range snap.CPUs {
 				mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, changedAt, status, summary)
 			}
 		}
 	}
 }
 // dbStatusToSATStatus converts ComponentStatusDB status strings to the format
 // expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
 // Surrounding whitespace is ignored and the canonical, trimmed value is
 // returned; any other status yields "".
 func dbStatusToSATStatus(s string) string {
 	// Return the trimmed value that was matched, not the raw input, so a padded
 	// status such as " OK " does not leak its whitespace to callers.
 	switch status := strings.TrimSpace(s); status {
 	case "OK", "Warning", "Critical", "Unknown":
 		return status
 	default:
 		return ""
 	}
 }
 // sanitizeBDFForLookup lower-cases and trims a PCIe BDF address so two
 // spellings of the same address compare equal. A value with exactly one ":"
 // gets the "0000:" domain prepended ("c8:00.0" → "0000:c8:00.0"); other values
 // are returned as normalised. Blank input, the literal "gpu", and values
 // with inner spaces or tabs yield "".
 func sanitizeBDFForLookup(bdf string) string {
 	addr := strings.ToLower(strings.TrimSpace(bdf))
 	switch {
 	case addr == "", addr == "gpu":
 		return ""
 	case strings.ContainsAny(addr, " \t"):
 		return ""
 	case strings.Count(addr, ":") == 1:
 		return "0000:" + addr
 	}
 	return addr
 }
 func ptrString(v *string) string {
 	if v == nil {
 		return ""
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
 	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
 	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)
 	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
 		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		}},
 	}
-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)
 	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
 	"bee-network.service",
 	"bee-nvidia.service",
 	"bee-preflight.service",
 	"bee-selfheal.service",
 	"bee-selfheal.timer",
 	"bee-sshsetup.service",
 }
@@ -27,13 +29,114 @@ var supportBundleCommands = []struct {
 	cmd  []string
 }{
 	{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
 	{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
 	{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
 	{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
 	{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
 	{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
 	{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
 	{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
 	{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
-	{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
+	{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
 	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
 	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
 for d in /sys/bus/pci/devices/*/; do
  vendor=$(cat "$d/vendor" 2>/dev/null)
  [ "$vendor" = "0x10de" ] || continue
  dev=$(basename "$d")
  echo "=== $dev ==="
  for f in current_link_speed current_link_width max_link_speed max_link_width; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
  done
 done
 `}},
 	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool -i "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool -m "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
 if ! command -v mstflint >/dev/null 2>&1; then
  echo "mstflint not found"
  exit 0
 fi
 found=0
 for path in /sys/bus/pci/devices/*; do
  [ -e "$path/vendor" ] || continue
  vendor=$(cat "$path/vendor" 2>/dev/null)
  [ "$vendor" = "0x15b3" ] || continue
  bdf=$(basename "$path")
  found=1
  echo "=== $bdf ==="
  mstflint -d "$bdf" q 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no Mellanox/NVIDIA networking devices found"
 fi
 `}},
 }
 // supportBundleOptionalFiles lists host files that are copied into the support
 // bundle on a best-effort basis: src is the path on the host and name is the
 // destination path relative to the bundle staging root. BuildSupportBundle
 // discards the copy error for these entries, so an absent file does not fail
 // the bundle.
 var supportBundleOptionalFiles = []struct {
 	name string
 	src  string
 }{
 	{name: "system/kern.log", src: "/var/log/kern.log"},
 	{name: "system/syslog.txt", src: "/var/log/syslog"},
 }
 const supportBundleGlob = "bee-support-*.tar.gz"
@@ -77,6 +180,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
 			return "", err
 		}
 	}
 	for _, item := range supportBundleOptionalFiles {
 		_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
 	}
 	if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
 		return "", err
 	}
@@ -184,6 +290,24 @@ func writeCommandOutput(dst string, cmd []string) error {
 	return os.WriteFile(dst, raw, 0644)
 }
 // copyOptionalFile copies the file at src to dst, creating dst's parent
 // directory (mode 0755) when needed. It returns the first error encountered
 // while opening src, preparing or creating dst, copying the data, or closing
 // dst. When src cannot be opened, dst is not created.
 func copyOptionalFile(src, dst string) (err error) {
 	in, err := os.Open(src)
 	if err != nil {
 		return err
 	}
 	defer in.Close()
 	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
 		return err
 	}
 	out, err := os.Create(dst)
 	if err != nil {
 		return err
 	}
 	// Closing a written file can surface a deferred write error; report it
 	// unless an earlier error is already being returned.
 	defer func() {
 		if cerr := out.Close(); err == nil {
 			err = cerr
 		}
 	}()
 	_, err = io.Copy(out, in)
 	return err
 }
 func writeManifest(dst, exportDir, stageRoot string) error {
 	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
 		return err
@@ -247,7 +371,7 @@ func copyDirContents(srcDir, dstDir string) error {
 }
 func copyExportDirForSupportBundle(srcDir, dstDir string) error {
-	return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
+	if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
 		cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
 		if cleanRel == "" {
 			return true
@@ -259,7 +383,25 @@ func copyExportDirForSupportBundle(srcDir, dstDir string) error {
 			return false
 		}
 		return true
-	})
+	}); err != nil {
 		return err
 	}
 	return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
 }
 // normalizeSupportBundleAuditJSON rewrites the JSON file at path with the
 // output of ApplySATOverlay. A missing file is not an error, and a failure
 // inside ApplySATOverlay leaves the file untouched and returns nil. Other
 // read errors and the write error are returned to the caller.
 func normalizeSupportBundleAuditJSON(path string) error {
 	raw, readErr := os.ReadFile(path)
 	switch {
 	case readErr == nil:
 		// File loaded; continue with the overlay below.
 	case os.IsNotExist(readErr):
 		return nil
 	default:
 		return readErr
 	}
 	overlaid, overlayErr := ApplySATOverlay(raw)
 	if overlayErr != nil {
 		// Keep the copied file as-is when the overlay cannot be applied.
 		return nil
 	}
 	return os.WriteFile(path, overlaid, 0644)
 }
 func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
--- a/audit/internal/collector/finalize.go
+++ b/audit/internal/collector/finalize.go
@@ -1,10 +1,18 @@
 package collector
-import "bee/audit/internal/schema"
+import (
 	"bee/audit/internal/schema"
 	"strings"
 )
 // NormalizeSnapshot is the exported entry point for finalizeSnapshot: it runs
 // the package's snapshot finalization on snap in place, passing collectedAt
 // through unchanged.
 func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
 	finalizeSnapshot(snap, collectedAt)
 }
 func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
 	snap.Memory = filterMemory(snap.Memory)
 	snap.Storage = filterStorage(snap.Storage)
 	snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
 	snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
 	setComponentStatusMetadata(snap, collectedAt)
@@ -33,11 +41,25 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
 		if disk.SerialNumber == nil || *disk.SerialNumber == "" {
 			continue
 		}
 		if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
 			continue
 		}
 		out = append(out, disk)
 	}
 	return out
 }
 // filterPCIe returns a new slice holding the devices from devs whose device
 // class does not contain "co-processor" (compared case-insensitively after
 // trimming). Devices with a nil DeviceClass are kept. Input order is preserved.
 func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
 	kept := make([]schema.HardwarePCIeDevice, 0, len(devs))
 	for i := range devs {
 		if class := devs[i].DeviceClass; class != nil {
 			lowered := strings.ToLower(strings.TrimSpace(*class))
 			if strings.Contains(lowered, "co-processor") {
 				continue
 			}
 		}
 		kept = append(kept, devs[i])
 	}
 	return kept
 }
 func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
 	out := make([]schema.HardwarePowerSupply, 0, len(psus))
 	for _, psu := range psus {
--- a/audit/internal/collector/finalize_test.go
+++ b/audit/internal/collector/finalize_test.go
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 	present := true
 	status := statusOK
 	serial := "SN-1"
 	virtualModel := "Virtual HDisk1"
 	realModel := "PASCARI"
 	coProcessorClass := "Co-processor"
 	gpuClass := "VideoController"
 	snap := schema.HardwareSnapshot{
 		Memory: []schema.HardwareMemory{
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 			{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 		},
 		Storage: []schema.HardwareStorage{
 			{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 		},
 		PCIeDevices: []schema.HardwarePCIeDevice{
 			{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 		},
 		PowerSupplies: []schema.HardwarePowerSupply{
 			{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
 			{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
 	if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
 	}
-	if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
+	if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
 	}
 	if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
 		t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
 	}
 	if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
 		t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
 	}
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -2,18 +2,21 @@ package collector
 import (
 	"bee/audit/internal/schema"
 	"context"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"time"
 )
 const mellanoxVendorID = 0x15b3
 const nicProbeTimeout = 2 * time.Second
 var (
 	mstflintQuery = func(bdf string) (string, error) {
-		out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
 		if err != nil {
 			return "", err
 		}
@@ -21,7 +24,7 @@ var (
 	}
 	ethtoolInfoQuery = func(iface string) (string, error) {
-		out, err := exec.Command("ethtool", "-i", iface).Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
 		if err != nil {
 			return "", err
 		}
@@ -29,6 +32,14 @@ var (
 	}
 	netIfacesByBDF = listNetIfacesByBDF
 	readNetCarrierFile = func(iface string) (string, error) {
 		path := filepath.Join("/sys/class/net", iface, "carrier")
 		raw, err := os.ReadFile(path)
 		if err != nil {
 			return "", err
 		}
 		return strings.TrimSpace(string(raw)), nil
 	}
 )
 // enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
 	}
 	return ifaces
 }
 // commandOutputWithTimeout runs name with args under a context that is
 // cancelled after timeout and returns the command's standard output together
 // with the error reported by Output.
 func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
 	deadlineCtx, stop := context.WithTimeout(context.Background(), timeout)
 	defer stop()
 	cmd := exec.CommandContext(deadlineCtx, name, args...)
 	return cmd.Output()
 }
 // interfaceHasCarrier reports whether the carrier value read for iface through
 // readNetCarrierFile is "1" after trimming. A read error counts as no carrier.
 func interfaceHasCarrier(iface string) bool {
 	state, err := readNetCarrierFile(iface)
 	return err == nil && strings.TrimSpace(state) == "1"
 }
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -12,7 +12,7 @@ import (
 var (
 	ethtoolModuleQuery = func(iface string) (string, error) {
-		out, err := raidToolQuery("ethtool", "-m", iface)
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
 		if err != nil {
 			return "", err
 		}
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
 			}
 		}
-		if out, err := ethtoolModuleQuery(iface); err == nil {
+		if interfaceHasCarrier(iface) {
-			if injectSFPDOMTelemetry(&devs[i], out) {
+			if out, err := ethtoolModuleQuery(iface); err == nil {
-				enriched++
+				if injectSFPDOMTelemetry(&devs[i], out) {
-				continue
+					enriched++
 					continue
 				}
 			}
 		}
 		if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	origReadMAC := readNetAddressFile
 	origEth := ethtoolInfoQuery
 	origModule := ethtoolModuleQuery
 	origCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		queryPCILSPCIDetail = origDetail
 		readPCIVPDFile = origVPD
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		readNetAddressFile = origReadMAC
 		ethtoolInfoQuery = origEth
 		ethtoolModuleQuery = origModule
 		readNetCarrierFile = origCarrier
 	})
 	queryPCILSPCIDetail = func(bdf string) (string, error) {
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		}
 		return "aa:bb:cc:dd:ee:ff", nil
 	}
 	readNetCarrierFile = func(string) (string, error) { return "1", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	}
 }
 // TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier checks that
 // enrichPCIeWithNICTelemetry does not call ethtoolModuleQuery when the carrier
 // file reads "0", while the MAC address is still recorded on the device.
 func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
 	// Save the package-level probe hooks so the stubs below can be undone.
 	origIfaces := netIfacesByBDF
 	origReadMAC := readNetAddressFile
 	origEth := ethtoolInfoQuery
 	origModule := ethtoolModuleQuery
 	origCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		netIfacesByBDF = origIfaces
 		readNetAddressFile = origReadMAC
 		ethtoolInfoQuery = origEth
 		ethtoolModuleQuery = origModule
 		readNetCarrierFile = origCarrier
 	})
 	// Stub a single interface with a MAC address and no carrier.
 	netIfacesByBDF = func(string) []string { return []string{"eth0"} }
 	readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
 	readNetCarrierFile = func(string) (string, error) { return "0", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	// Any call to the module query fails the test.
 	ethtoolModuleQuery = func(string) (string, error) {
 		t.Fatal("ethtool -m should not be called without carrier")
 		return "", nil
 	}
 	class := "EthernetController"
 	bdf := "0000:18:00.0"
 	devs := []schema.HardwarePCIeDevice{{
 		DeviceClass: &class,
 		BDF:         &bdf,
 	}}
 	out := enrichPCIeWithNICTelemetry(devs)
 	// The MAC address must still be collected even though optics were skipped.
 	if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
 		t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
 	}
 }
 func TestDBMValue(t *testing.T) {
 	tests := []struct {
 		in   string
--- a/audit/internal/collector/nvidia.go
+++ b/audit/internal/collector/nvidia.go
@@ -13,14 +13,18 @@ import (
 const nvidiaVendorID = 0x10de
 type nvidiaGPUInfo struct {
-	BDF            string
+	BDF                string
-	Serial         string
+	Serial             string
-	VBIOS          string
+	VBIOS              string
-	TemperatureC   *float64
+	TemperatureC       *float64
-	PowerW         *float64
+	PowerW             *float64
-	ECCUncorrected *int64
+	ECCUncorrected     *int64
-	ECCCorrected   *int64
+	ECCCorrected       *int64
-	HWSlowdown     *bool
+	HWSlowdown         *bool
 	PCIeLinkGenCurrent *int
 	PCIeLinkGenMax     *int
 	PCIeLinkWidthCur   *int
 	PCIeLinkWidthMax   *int
 }
 // enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
 func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
 	out, err := exec.Command(
 		"nvidia-smi",
-		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
+		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
 		"--format=csv,noheader,nounits",
 	).Output()
 	if err != nil {
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		if len(rec) == 0 {
 			continue
 		}
-		if len(rec) < 9 {
+		if len(rec) < 13 {
-			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
+			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
 		}
 		bdf := normalizePCIeBDF(rec[1])
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
 		}
 		info := nvidiaGPUInfo{
-			BDF:            bdf,
+			BDF:                bdf,
-			Serial:         strings.TrimSpace(rec[2]),
+			Serial:             strings.TrimSpace(rec[2]),
-			VBIOS:          strings.TrimSpace(rec[3]),
+			VBIOS:              strings.TrimSpace(rec[3]),
-			TemperatureC:   parseMaybeFloat(rec[4]),
+			TemperatureC:       parseMaybeFloat(rec[4]),
-			PowerW:         parseMaybeFloat(rec[5]),
+			PowerW:             parseMaybeFloat(rec[5]),
-			ECCUncorrected: parseMaybeInt64(rec[6]),
+			ECCUncorrected:     parseMaybeInt64(rec[6]),
-			ECCCorrected:   parseMaybeInt64(rec[7]),
+			ECCCorrected:       parseMaybeInt64(rec[7]),
-			HWSlowdown:     parseMaybeBool(rec[8]),
+			HWSlowdown:         parseMaybeBool(rec[8]),
 			PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
 			PCIeLinkGenMax:     parseMaybeInt(rec[10]),
 			PCIeLinkWidthCur:   parseMaybeInt(rec[11]),
 			PCIeLinkWidthMax:   parseMaybeInt(rec[12]),
 		}
 		result[bdf] = info
 	}
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
 	return &n
 }
 // parseMaybeInt parses v as a base-10 int after trimming whitespace. It
 // returns nil for blank input, for the placeholders "n/a", "not supported" and
 // "[not supported]" (any letter case), and for anything strconv.Atoi rejects.
 func parseMaybeInt(v string) *int {
 	text := strings.TrimSpace(v)
 	switch {
 	case text == "":
 		return nil
 	case strings.EqualFold(text, "n/a"),
 		strings.EqualFold(text, "not supported"),
 		strings.EqualFold(text, "[not supported]"):
 		return nil
 	}
 	parsed, err := strconv.Atoi(text)
 	if err != nil {
 		return nil
 	}
 	return &parsed
 }
 // pcieLinkGenLabel formats a PCIe generation number as "Gen<N>", e.g. 4 → "Gen4".
 func pcieLinkGenLabel(gen int) string {
 	return "Gen" + strconv.Itoa(gen)
 }
 func parseMaybeBool(v string) *bool {
 	v = strings.TrimSpace(strings.ToLower(v))
 	switch v {
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
 	if info.HWSlowdown != nil {
 		dev.HWSlowdown = info.HWSlowdown
 	}
 	// Override PCIe link speed/width with nvidia-smi driver values.
 	// sysfs current_link_speed reflects the instantaneous physical link state and
 	// can show Gen1 when the GPU is idle due to ASPM power management. The driver
 	// knows the negotiated speed regardless of the current power state.
 	if info.PCIeLinkGenCurrent != nil {
 		s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
 		dev.LinkSpeed = &s
 	}
 	if info.PCIeLinkGenMax != nil {
 		s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
 		dev.MaxLinkSpeed = &s
 	}
 	if info.PCIeLinkWidthCur != nil {
 		dev.LinkWidth = info.PCIeLinkWidthCur
 	}
 	if info.PCIeLinkWidthMax != nil {
 		dev.MaxLinkWidth = info.PCIeLinkWidthMax
 	}
 }
--- a/audit/internal/collector/nvidia_test.go
+++ b/audit/internal/collector/nvidia_test.go
@@ -6,7 +6,7 @@ import (
 )
 func TestParseNVIDIASMIQuery(t *testing.T) {
-	raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
+	raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
 	byBDF, err := parseNVIDIASMIQuery(raw)
 	if err != nil {
 		t.Fatalf("parse failed: %v", err)
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
 	if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
 		t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
 	}
 	if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
 		t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
 	}
 	if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
 		t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
 	}
 }
 func TestNormalizePCIeBDF(t *testing.T) {
--- a/audit/internal/collector/pcie.go
+++ b/audit/internal/collector/pcie.go
@@ -59,6 +59,7 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
 		"host bridge",
 		"isa bridge",
 		"pci bridge",
 		"co-processor",
 		"performance counter",
 		"performance counters",
 		"ram memory",
--- a/audit/internal/collector/pcie_filter_test.go
+++ b/audit/internal/collector/pcie_filter_test.go
@@ -19,6 +19,7 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
 		{name: "audio", class: "Audio device", want: false},
 		{name: "host bridge", class: "Host bridge", want: false},
 		{name: "pci bridge", class: "PCI bridge", want: false},
 		{name: "co-processor", class: "Co-processor", want: false},
 		{name: "smbus", class: "SMBus", want: false},
 		{name: "perf", class: "Performance counters", want: false},
 		{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
@@ -76,6 +77,20 @@ func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
 	}
 }
 // TestParseLspci_filtersCoProcessors feeds parseLspci one "Co-processor"
 // record and one VGA controller record and expects only the H100 to remain.
 func TestParseLspci_filtersCoProcessors(t *testing.T) {
 	const qatRecord = "Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n"
 	const gpuRecord = "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
 	got := parseLspci(qatRecord + gpuRecord)
 	if n := len(got); n != 1 {
 		t.Fatalf("expected 1 remaining device, got %d", n)
 	}
 	survivor := got[0]
 	if survivor.Model == nil || *survivor.Model != "H100" {
 		t.Fatalf("unexpected remaining device: %+v", survivor)
 	}
 }
 func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
 	input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
--- a/audit/internal/collector/storage.go
+++ b/audit/internal/collector/storage.go
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
 		if dev.Type != "disk" {
 			continue
 		}
 		if isVirtualBMCDisk(dev) {
 			slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
 			continue
 		}
 		disks = append(disks, dev)
 	}
 	return disks
 }
 // isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
 // that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
 // Such devices are described as having zero reported size, a generic fake
 // serial, and a model name that starts with "Virtual HDisk". Only the model
 // name is checked here, by delegating to isVirtualHDiskModel; size and serial
 // are not inspected.
 func isVirtualBMCDisk(dev lsblkDevice) bool {
 	return isVirtualHDiskModel(dev.Model)
 }
 // isVirtualHDiskModel reports whether model, once trimmed and lower-cased,
 // begins with "virtual hdisk".
 func isVirtualHDiskModel(model string) bool {
 	const prefix = "virtual hdisk"
 	normalized := strings.ToLower(strings.TrimSpace(model))
 	return strings.HasPrefix(normalized, prefix)
 }
 func lsblkDevices() []lsblkDevice {
 	out, err := exec.Command("lsblk", "-J", "-d",
 		"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -0,0 +1,141 @@
 package platform
 import (
 	"fmt"
 	"strings"
 	"time"
 )
 // renderBenchmarkReport renders result as the plain-text benchmark report.
 // Sections are written in this order: header, executive summary (only when
 // there are findings), warnings (only when there are any), per-GPU scorecard,
 // interconnect (only when result.Interconnect is set), methodology, and the
 // list of raw files. The "Generated" timestamp is converted to UTC before
 // formatting so that it agrees with the literal "UTC" suffix in the layout.
 func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
 	fmt.Fprintf(&b, "===========================\n\n")
 	// The layout hard-codes "UTC", so the time must actually be in UTC.
 	fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
 	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
 	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
 	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
 	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
 	if len(result.Findings) > 0 {
 		fmt.Fprintf(&b, "Executive Summary\n")
 		fmt.Fprintf(&b, "-----------------\n")
 		for _, finding := range result.Findings {
 			fmt.Fprintf(&b, "- %s\n", finding)
 		}
 		b.WriteString("\n")
 	}
 	if len(result.Warnings) > 0 {
 		fmt.Fprintf(&b, "Warnings\n")
 		fmt.Fprintf(&b, "--------\n")
 		for _, warning := range result.Warnings {
 			fmt.Fprintf(&b, "- %s\n", warning)
 		}
 		b.WriteString("\n")
 	}
 	fmt.Fprintf(&b, "Per GPU Scorecard\n")
 	fmt.Fprintf(&b, "-----------------\n")
 	for _, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "GPU %d  %s\n", gpu.Index, gpu.Name)
 		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
 		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
 		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
 		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
 		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
 		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
 		// The interconnect line is printed only for a positive score.
 		if gpu.Scores.InterconnectScore > 0 {
 			fmt.Fprintf(&b, "  Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
 		}
 		if len(gpu.DegradationReasons) > 0 {
 			fmt.Fprintf(&b, "  Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
 		}
 		fmt.Fprintf(&b, "  Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
 		fmt.Fprintf(&b, "  P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
 		if len(gpu.PrecisionResults) > 0 {
 			fmt.Fprintf(&b, "  Precision results:\n")
 			for _, precision := range gpu.PrecisionResults {
 				if precision.Supported {
 					fmt.Fprintf(&b, "    - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
 				} else {
 					fmt.Fprintf(&b, "    - %s: unsupported (%s)\n", precision.Name, precision.Notes)
 				}
 			}
 		}
 		fmt.Fprintf(&b, "  Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
 			gpu.Throttle.SWPowerCapUS,
 			gpu.Throttle.SWThermalSlowdownUS,
 			gpu.Throttle.SyncBoostUS,
 			gpu.Throttle.HWThermalSlowdownUS,
 			gpu.Throttle.HWPowerBrakeSlowdownUS,
 		)
 		if len(gpu.Notes) > 0 {
 			fmt.Fprintf(&b, "  Notes:\n")
 			for _, note := range gpu.Notes {
 				fmt.Fprintf(&b, "    - %s\n", note)
 			}
 		}
 		b.WriteString("\n")
 	}
 	if result.Interconnect != nil {
 		fmt.Fprintf(&b, "Interconnect\n")
 		fmt.Fprintf(&b, "------------\n")
 		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
 		// Bandwidth figures are printed only when the interconnect run is marked supported.
 		if result.Interconnect.Supported {
 			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
 			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
 		}
 		for _, note := range result.Interconnect.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
 		b.WriteString("\n")
 	}
 	fmt.Fprintf(&b, "Methodology\n")
 	fmt.Fprintf(&b, "-----------\n")
 	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
 	fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
 	fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
 	fmt.Fprintf(&b, "Raw Files\n")
 	fmt.Fprintf(&b, "---------\n")
 	fmt.Fprintf(&b, "- result.json\n")
 	fmt.Fprintf(&b, "- report.txt\n")
 	fmt.Fprintf(&b, "- summary.txt\n")
 	fmt.Fprintf(&b, "- verbose.log\n")
 	fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
 	fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
 	fmt.Fprintf(&b, "- gpu-*-steady.log\n")
 	fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
 	fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
 	// The NCCL log is listed only when an interconnect result exists.
 	if result.Interconnect != nil {
 		fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
 	}
 	return b.String()
 }
 // renderBenchmarkSummary renders result as newline-separated key=value lines:
 // run timestamp, profile, overall status, GPU count, normalization status,
 // per-GPU status and composite score, the best composite score across GPUs
 // (0.00 when there are none), and, when result.Interconnect is set, the
 // interconnect status and maximum bus bandwidth.
 func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	// The key promises UTC, so convert before formatting rather than emitting
 	// whatever zone GeneratedAt happens to carry.
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
 	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
 	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
 	fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
 	var best float64
 	for i, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
 		fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
 		// Seed best from the first GPU so negative scores are handled too.
 		if i == 0 || gpu.Scores.CompositeScore > best {
 			best = gpu.Scores.CompositeScore
 		}
 	}
 	fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
 	if result.Interconnect != nil {
 		fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
 		fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
 	}
 	return b.String()
 }
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -0,0 +1,132 @@
 package platform
 import (
 	"strings"
 	"testing"
 )
 // TestResolveBenchmarkProfile checks the phase durations that
 // resolveBenchmarkProfile returns for the empty (default), "stability" and
 // "overnight" profile names.
 func TestResolveBenchmarkProfile(t *testing.T) {
 	t.Parallel()
 	type profileCase struct {
 		name    string
 		profile string
 		want    benchmarkProfileSpec
 	}
 	table := []profileCase{
 		{"default", "", benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}},
 		{"stability", "stability", benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}},
 		{"overnight", "overnight", benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}},
 	}
 	for idx := range table {
 		// Copy the row so the closure does not depend on loop-variable semantics.
 		row := table[idx]
 		t.Run(row.name, func(t *testing.T) {
 			if got := resolveBenchmarkProfile(row.profile); got != row.want {
 				t.Fatalf("profile=%q got %+v want %+v", row.profile, got, row.want)
 			}
 		})
 	}
 }
 // TestParseBenchmarkBurnLog feeds a minimal single-GPU bee-gpu-burn log to
 // parseBenchmarkBurnLog and checks the backend, compute capability and the two
 // precision profiles it is expected to extract.
 func TestParseBenchmarkBurnLog(t *testing.T) {
 	t.Parallel()
 	logLines := []string{
 		"loader=bee-gpu-burn",
 		"[gpu 0] device=NVIDIA H100",
 		"[gpu 0] compute_capability=9.0",
 		"[gpu 0] backend=cublasLt",
 		"[gpu 0] duration_s=10",
 		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
 		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
 		"[gpu 0] fp16_tensor_iterations=200",
 		"[gpu 0] fp8_e4m3_iterations=50",
 		"[gpu 0] status=OK",
 	}
 	parsed := parseBenchmarkBurnLog(strings.Join(logLines, "\n"))
 	if backend := parsed.Backend; backend != "cublasLt" {
 		t.Fatalf("backend=%q want cublasLt", backend)
 	}
 	if cc := parsed.ComputeCapability; cc != "9.0" {
 		t.Fatalf("compute capability=%q want 9.0", cc)
 	}
 	// The length check must come first: the index expressions below rely on it.
 	if count := len(parsed.Profiles); count != 2 {
 		t.Fatalf("profiles=%d want 2", count)
 	}
 	if rate := parsed.Profiles[0].TeraOpsPerSec; rate <= 0 {
 		t.Fatalf("profile[0] teraops=%f want >0", rate)
 	}
 	if category := parsed.Profiles[1].Category; category != "fp8" {
 		t.Fatalf("profile[1] category=%q want fp8", category)
 	}
 }
 func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 	t.Parallel()
 	result := NvidiaBenchmarkResult{
 		BenchmarkVersion:   benchmarkVersion,
 		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
 		OverallStatus:      "PARTIAL",
 		SelectedGPUIndices: []int{0},
 		Normalization: BenchmarkNormalization{
 			Status: "partial",
 		},
 		Findings: []string{"GPU 0 spent measurable time under SW power cap."},
 		GPUs: []BenchmarkGPUResult{
 			{
 				Index:  0,
 				Name:   "NVIDIA H100",
 				Status: "OK",
 				Steady: BenchmarkTelemetrySummary{
 					AvgPowerW:           680,
 					AvgTempC:            79,
 					AvgGraphicsClockMHz: 1725,
 					P95PowerW:           700,
 					P95TempC:            82,
 					P95GraphicsClockMHz: 1800,
 				},
 				Scores: BenchmarkScorecard{
 					ComputeScore:        1200,
 					PowerSustainScore:   96,
 					ThermalSustainScore: 88,
 					StabilityScore:      92,
 					CompositeScore:      1176,
 				},
 				PrecisionResults: []BenchmarkPrecisionResult{
 					{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
 				},
 				Throttle: BenchmarkThrottleCounters{
 					SWPowerCapUS: 1000000,
 				},
 				DegradationReasons: []string{"power_capped"},
 			},
 		},
 	}
 	report := renderBenchmarkReport(result)
 	for _, needle := range []string{
 		"Executive Summary",
 		"GPU 0 spent measurable time under SW power cap.",
 		"Composite score: 1176.00",
 		"fp16_tensor: 700.00 TOPS",
 	} {
 		if !strings.Contains(report, needle) {
 			t.Fatalf("report missing %q\n%s", needle, report)
 		}
 	}
 }
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -0,0 +1,132 @@
 package platform
 import "time"
 // Benchmark profile names. TestResolveBenchmarkProfile expects
 // resolveBenchmarkProfile to report these values in benchmarkProfileSpec.Name,
 // with an empty profile string resolving to the standard profile.
 const (
 	NvidiaBenchmarkProfileStandard  = "standard"
 	NvidiaBenchmarkProfileStability = "stability"
 	NvidiaBenchmarkProfileOvernight = "overnight"
 )
 // NvidiaBenchmarkOptions carries the caller-selected settings for an NVIDIA
 // benchmark run.
 // NOTE(review): the field meanings below are inferred from the names only —
 // confirm against the code that consumes these options.
 type NvidiaBenchmarkOptions struct {
 	// Profile is presumably one of the NvidiaBenchmarkProfile* names.
 	Profile           string
 	SizeMB            int
 	// GPUIndices / ExcludeGPUIndices presumably select and filter GPUs by index.
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	// RunNCCL presumably enables the NCCL interconnect phase — TODO confirm.
 	RunNCCL           bool
 }
 // NvidiaBenchmarkResult is the top-level record of one benchmark run. It
 // marshals to JSON using the field names in the struct tags; Hostname,
 // Findings, Warnings and Interconnect are omitted from the JSON when empty.
 // renderBenchmarkSummary reads GeneratedAt, BenchmarkProfile, OverallStatus,
 // Normalization.Status, GPUs and Interconnect from it.
 type NvidiaBenchmarkResult struct {
 	BenchmarkVersion   string                       `json:"benchmark_version"`
 	GeneratedAt        time.Time                    `json:"generated_at"`
 	Hostname           string                       `json:"hostname,omitempty"`
 	BenchmarkProfile   string                       `json:"benchmark_profile"`
 	OverallStatus      string                       `json:"overall_status"`
 	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
 	Findings           []string                     `json:"findings,omitempty"`
 	Warnings           []string                     `json:"warnings,omitempty"`
 	Normalization      BenchmarkNormalization       `json:"normalization"`
 	GPUs               []BenchmarkGPUResult         `json:"gpus"`
 	// Interconnect is nil when there is no interconnect section; the renderers
 	// skip their interconnect lines in that case.
 	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
 }
 // BenchmarkNormalization is the "normalization" section of
 // NvidiaBenchmarkResult: an overall Status string plus optional notes and
 // per-GPU entries.
 // NOTE(review): the set of Status values is not visible here (the tests use
 // "partial") — confirm against the producer.
 type BenchmarkNormalization struct {
 	Status string                      `json:"status"`
 	Notes  []string                    `json:"notes,omitempty"`
 	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
 }
 // BenchmarkNormalizationGPU is one per-GPU entry of BenchmarkNormalization.
 // Every field except Index is dropped from the JSON when it holds its zero
 // value.
 // NOTE(review): presumably records the persistence mode and the clock locks
 // applied before measuring, with clocks in MHz as the names suggest — confirm.
 type BenchmarkNormalizationGPU struct {
 	Index                 int      `json:"index"`
 	PersistenceMode       string   `json:"persistence_mode,omitempty"`
 	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
 	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
 	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
 	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
 	Notes                 []string `json:"notes,omitempty"`
 }
 // BenchmarkGPUResult is the per-GPU section of a benchmark result (the element
 // type of NvidiaBenchmarkResult.GPUs). It groups identity fields, three
 // telemetry summaries (Baseline, Steady, Cooldown), throttle counters,
 // per-precision results and the scorecard. renderBenchmarkSummary prints
 // Index, Status and Scores.CompositeScore for each entry.
 type BenchmarkGPUResult struct {
 	Index                  int                        `json:"index"`
 	UUID                   string                     `json:"uuid,omitempty"`
 	Name                   string                     `json:"name,omitempty"`
 	BusID                  string                     `json:"bus_id,omitempty"`
 	VBIOS                  string                     `json:"vbios,omitempty"`
 	ComputeCapability      string                     `json:"compute_capability,omitempty"`
 	Backend                string                     `json:"backend,omitempty"`
 	Status                 string                     `json:"status"`
 	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
 	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
 	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
 	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
 	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
 	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
 	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
 	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
 	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
 	Scores                 BenchmarkScorecard         `json:"scores"`
 	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
 	Notes                  []string                   `json:"notes,omitempty"`
 }
 // BenchmarkTelemetrySummary holds aggregate telemetry for one benchmark phase;
 // BenchmarkGPUResult embeds one each for baseline, steady and cooldown. All
 // fields are always emitted in the JSON (no omitempty).
 // NOTE(review): units (°C, W, MHz, %) and the meaning of the CV/drift fields
 // are inferred from the field names — confirm against the code that fills them.
 type BenchmarkTelemetrySummary struct {
 	DurationSec         float64 `json:"duration_sec"`
 	Samples             int     `json:"samples"`
 	AvgTempC            float64 `json:"avg_temp_c"`
 	P95TempC            float64 `json:"p95_temp_c"`
 	AvgPowerW           float64 `json:"avg_power_w"`
 	P95PowerW           float64 `json:"p95_power_w"`
 	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
 	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
 	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
 	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
 	AvgUsagePct         float64 `json:"avg_usage_pct"`
 	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
 	ClockCVPct          float64 `json:"clock_cv_pct"`
 	PowerCVPct          float64 `json:"power_cv_pct"`
 	TempCVPct           float64 `json:"temp_cv_pct"`
 	ClockDriftPct       float64 `json:"clock_drift_pct"`
 }
 // BenchmarkThrottleCounters holds one unsigned counter per throttle reason for
 // a GPU (BenchmarkGPUResult.Throttle).
 // NOTE(review): the "US" suffix presumably means microseconds spent under each
 // reason — confirm against the code that fills these counters.
 type BenchmarkThrottleCounters struct {
 	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
 	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
 	SyncBoostUS            uint64 `json:"sync_boost_us"`
 	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
 	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
 }
 // BenchmarkPrecisionResult is one entry of BenchmarkGPUResult.PrecisionResults.
 // Name, Category and Supported are always emitted in the JSON; the remaining
 // fields are dropped when zero.
 // NOTE(review): M, N and K look like matrix dimensions and Lanes like the
 // number of concurrent streams — confirm against the burn-log parser.
 type BenchmarkPrecisionResult struct {
 	Name          string  `json:"name"`
 	Category      string  `json:"category"`
 	Supported     bool    `json:"supported"`
 	Lanes         int     `json:"lanes,omitempty"`
 	M             uint64  `json:"m,omitempty"`
 	N             uint64  `json:"n,omitempty"`
 	K             uint64  `json:"k,omitempty"`
 	Iterations    uint64  `json:"iterations,omitempty"`
 	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
 	Notes         string  `json:"notes,omitempty"`
 }
 // BenchmarkScorecard holds the per-GPU scores (BenchmarkGPUResult.Scores).
 // renderBenchmarkSummary prints CompositeScore for every GPU and the highest
 // value across all GPUs.
 // NOTE(review): the formulas and value ranges behind each score are not
 // visible here — confirm against the scoring code.
 type BenchmarkScorecard struct {
 	ComputeScore        float64 `json:"compute_score"`
 	PowerSustainScore   float64 `json:"power_sustain_score"`
 	ThermalSustainScore float64 `json:"thermal_sustain_score"`
 	StabilityScore      float64 `json:"stability_score"`
 	InterconnectScore   float64 `json:"interconnect_score"`
 	CompositeScore      float64 `json:"composite_score"`
 }
 // BenchmarkInterconnectResult is the optional interconnect section of
 // NvidiaBenchmarkResult. renderBenchmarkSummary prints Status and MaxBusBWGBps
 // when the section is present.
 // NOTE(review): bandwidth fields are presumably GB/s as measured by the NCCL
 // all-reduce run — confirm units against the code that parses that log.
 type BenchmarkInterconnectResult struct {
 	Status             string   `json:"status"`
 	Attempted          bool     `json:"attempted"`
 	Supported          bool     `json:"supported"`
 	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
 	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
 	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
 	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
 	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
 	Notes              []string `json:"notes,omitempty"`
 }
--- a/audit/internal/platform/error_patterns.go
+++ b/audit/internal/platform/error_patterns.go
@@ -0,0 +1,139 @@
 package platform
 import "regexp"
 // ErrorPattern describes a kernel log pattern that indicates a hardware error.
 // Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
 type ErrorPattern struct {
 	// Name is a short machine-readable label for logging and deduplication.
 	Name string
 	// Re is the compiled regular expression matched against a single kmsg line.
 	Re *regexp.Regexp
 	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
 	Category string
 	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
 	Severity string
 	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
 	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
 	BDFGroup int
 	// DevGroup is the capture group index (1-based) that contains a device name
 	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
 	DevGroup int
 }
 // HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
 // To add a new pattern: append a new ErrorPattern struct to this slice.
 //
 // Every expression is case-insensitive ((?i)) and unanchored, so a pattern for
 // a "corrected"/"correctable" event must start that word at a word boundary
 // (\b); otherwise it also fires on "Uncorrected"/"Uncorrectable" lines.
 var HardwareErrorPatterns = []ErrorPattern{
 	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
 	{
 		Name:     "nvidia-rminitadapter",
 		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "nvidia-msi-fail",
 		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
 		Category: "gpu",
 		Severity: "warning",
 	},
 	{
 		Name:     "nvidia-aer",
 		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name: "nvidia-xid",
 		// Xid lines identify the GPU as "PCI:dddd:bb:dd" without a ".f" function
 		// suffix, so the suffix is optional here. The captured BDF may therefore
 		// be function-less (e.g. "0000:c8:00").
 		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}(?:\.\d)?)`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
 	{
 		Name:     "pcie-aer",
 		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "pcie-uncorrectable",
 		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "pcie-link-down",
 		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	// ── Storage ─────────────────────────────────────────────────────────────────
 	{
 		Name:     "blk-io-error",
 		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	{
 		Name:     "nvme-timeout",
 		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	{
 		Name:     "scsi-failed",
 		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
 		Category: "storage",
 		Severity: "warning",
 	},
 	{
 		Name:     "nvme-reset",
 		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	// ── Machine Check Exceptions ────────────────────────────────────────────────
 	{
 		Name:     "mce-hardware-error",
 		Re:       mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
 		Category: "mce",
 		Severity: "warning",
 	},
 	{
 		Name: "mce-corrected",
 		// \b keeps "Uncorrected ..." lines from being classified as corrected.
 		Re:       mustPat(`(?i)mce:.*\bcorrected`),
 		Category: "mce",
 		Severity: "warning",
 	},
 	// ── Memory ─────────────────────────────────────────────────────────────────
 	{
 		Name:     "edac-ue",
 		Re:       mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
 		Category: "memory",
 		Severity: "warning",
 	},
 	{
 		Name: "edac-ce",
 		// \b keeps "Uncorrectable" lines (already covered by edac-ue) out of this pattern.
 		Re:       mustPat(`(?i)EDAC.*\bcorrectable`),
 		Category: "memory",
 		Severity: "warning",
 	},
 }
 // mustPat compiles s with regexp.MustCompile, which panics on an invalid
 // expression. It is only used to build HardwareErrorPatterns at package
 // initialisation, so a bad pattern fails fast at start-up.
 func mustPat(s string) *regexp.Regexp {
 	return regexp.MustCompile(s)
 }
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -20,12 +20,13 @@ type GPUMetricRow struct {
 	MemUsagePct float64 `json:"mem_usage_pct"`
 	PowerW      float64 `json:"power_w"`
 	ClockMHz    float64 `json:"clock_mhz"`
 	MemClockMHz float64 `json:"mem_clock_mhz"`
 }
 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
 func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 	args := []string{
-		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
+		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
 		"--format=csv,noheader,nounits",
 	}
 	if len(gpuIndices) > 0 {
@@ -46,7 +47,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			continue
 		}
 		parts := strings.Split(line, ", ")
-		if len(parts) < 6 {
+		if len(parts) < 7 {
 			continue
 		}
 		idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
@@ -57,6 +58,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			MemUsagePct: parseGPUFloat(parts[3]),
 			PowerW:      parseGPUFloat(parts[4]),
 			ClockMHz:    parseGPUFloat(parts[5]),
 			MemClockMHz: parseGPUFloat(parts[6]),
 		})
 	}
 	return rows, nil
@@ -139,10 +141,10 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
+	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
 	for _, r := range rows {
-		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
+		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
-			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
+			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
@@ -197,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	const PW = plotX2 - plotX1
 	const PH = plotY2 - plotY1
 	// Outer axes
-	const tempAxisX = 60  // temp axis line
+	const tempAxisX = 60   // temp axis line
 	const clockAxisX = 900 // clock axis line
 	colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
--- a/audit/internal/platform/install.go
+++ b/audit/internal/platform/install.go
@@ -11,10 +11,10 @@ import (
 // InstallDisk describes a candidate disk for installation.
 type InstallDisk struct {
-	Device      string   // e.g. /dev/sda
+	Device       string // e.g. /dev/sda
-	Model       string
+	Model        string
-	Size        string   // human-readable, e.g. "500G"
+	Size         string   // human-readable, e.g. "500G"
-	SizeBytes   int64    // raw byte count from lsblk
+	SizeBytes    int64    // raw byte count from lsblk
 	MountedParts []string // partition mount points currently active
 }
@@ -117,6 +117,61 @@ func findLiveBootDevice() string {
 	return "/dev/" + strings.TrimSpace(string(out2))
 }
 // mountSource asks findmnt for the SOURCE column of the mount at target and
 // returns it with surrounding whitespace trimmed. It returns "" when findmnt
 // fails.
 func mountSource(target string) string {
 	raw, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
 	if err == nil {
 		return strings.TrimSpace(string(raw))
 	}
 	return ""
 }
 // mountFSType asks findmnt for the FSTYPE column of the mount at target and
 // returns it with surrounding whitespace trimmed. It returns "" when findmnt
 // fails.
 func mountFSType(target string) string {
 	raw, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
 	if err == nil {
 		return strings.TrimSpace(string(raw))
 	}
 	return ""
 }
 // blockDeviceType returns the lsblk TYPE column for device with surrounding
 // whitespace trimmed. It returns "" for a blank device argument or when lsblk
 // fails; lsblk is not invoked for a blank argument.
 func blockDeviceType(device string) string {
 	if len(strings.TrimSpace(device)) == 0 {
 		return ""
 	}
 	raw, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
 	if err == nil {
 		return strings.TrimSpace(string(raw))
 	}
 	return ""
 }
 // blockDeviceTransport returns the lsblk TRAN column for device with
 // surrounding whitespace trimmed. It returns "" for a blank device argument or
 // when lsblk fails; lsblk is not invoked for a blank argument.
 func blockDeviceTransport(device string) string {
 	if len(strings.TrimSpace(device)) == 0 {
 		return ""
 	}
 	raw, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
 	if err == nil {
 		return strings.TrimSpace(string(raw))
 	}
 	return ""
 }
 // inferLiveBootKind classifies the live boot medium from its mount and block
 // device attributes. All inputs are whitespace-trimmed; fsType, deviceType and
 // transport are compared case-insensitively. The first matching rule wins:
 //
 //	fsType "tmpfs"             -> "ram"
 //	deviceType "rom"           -> "cdrom"
 //	transport "usb"            -> "usb"
 //	source prefixed "/dev/sr"  -> "cdrom"
 //	source prefixed "/dev/"    -> "disk"
 //	anything else              -> "unknown"
 func inferLiveBootKind(fsType, source, deviceType, transport string) string {
 	if strings.EqualFold(strings.TrimSpace(fsType), "tmpfs") {
 		return "ram"
 	}
 	if strings.EqualFold(strings.TrimSpace(deviceType), "rom") {
 		return "cdrom"
 	}
 	if strings.EqualFold(strings.TrimSpace(transport), "usb") {
 		return "usb"
 	}
 	src := strings.TrimSpace(source)
 	if strings.HasPrefix(src, "/dev/sr") {
 		return "cdrom"
 	}
 	if strings.HasPrefix(src, "/dev/") {
 		return "disk"
 	}
 	return "unknown"
 }
 // MinInstallBytes returns the minimum recommended disk size for installation:
 // squashfs size × 1.5 to allow for extracted filesystem and bootloader.
 // Returns 0 if the squashfs is not available (non-live environment).
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -12,11 +12,40 @@ import (
 )
 func (s *System) IsLiveMediaInRAM() bool {
-	out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
+	fsType := mountFSType("/run/live/medium")
-	if err != nil {
+	if fsType == "" {
 		return toramActive()
 	}
-	return strings.TrimSpace(string(out)) == "tmpfs"
+	return strings.EqualFold(fsType, "tmpfs")
 }
 // LiveBootSource reports where the live system was booted from. It queries the
 // filesystem type and source of /run/live/medium plus the live boot device.
 // When all three come back empty it falls back to toramActive(): "ram" when
 // that is true, "unknown" otherwise. In every other case the kind comes from
 // inferLiveBootKind, and an in-RAM result with a blank source reports "tmpfs"
 // as its source.
 func (s *System) LiveBootSource() LiveBootSource {
 	const liveMedium = "/run/live/medium"
 	fsType := mountFSType(liveMedium)
 	source := mountSource(liveMedium)
 	device := findLiveBootDevice()
 	if fsType == "" && source == "" && device == "" {
 		// Nothing could be probed; only the toram check can still classify the boot.
 		if !toramActive() {
 			return LiveBootSource{Kind: "unknown"}
 		}
 		return LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}
 	}
 	kind := inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
 	if kind == "" {
 		kind = "unknown"
 	}
 	result := LiveBootSource{
 		InRAM:  strings.EqualFold(fsType, "tmpfs"),
 		Kind:   kind,
 		Source: source,
 		Device: device,
 	}
 	if result.InRAM && strings.TrimSpace(source) == "" {
 		result.Source = "tmpfs"
 	}
 	return result
 }
 func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -0,0 +1,28 @@
 package platform
 import "testing"
 // TestInferLiveBootKind runs inferLiveBootKind over one representative input
 // per boot kind (ram, usb, cdrom, disk, unknown) as named subtests.
 func TestInferLiveBootKind(t *testing.T) {
 	type bootCase struct {
 		name       string
 		fsType     string
 		source     string
 		deviceType string
 		transport  string
 		want       string
 	}
 	table := []bootCase{
 		{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
 		{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
 		{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
 		{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
 		{name: "unknown", source: "overlay", want: "unknown"},
 	}
 	for idx := range table {
 		row := table[idx]
 		t.Run(row.name, func(t *testing.T) {
 			got := inferLiveBootKind(row.fsType, row.source, row.deviceType, row.transport)
 			if got == row.want {
 				return
 			}
 			t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", row.fsType, row.source, row.deviceType, row.transport, got, row.want)
 		})
 	}
 }
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -0,0 +1,64 @@
 package platform
 import (
 	"fmt"
 	"os"
 	"strconv"
 	"strings"
 	"syscall"
 )
 // workerPatterns are substrings matched against the first token of
 // /proc/<pid>/cmdline to identify bee test worker processes that should be
 // killed by KillTestWorkers.
 var workerPatterns = []string{
 	"bee-gpu-burn",
 	"stress-ng",
 	"stressapptest",
 	"memtester",
 }
 // KilledProcess describes a process that was successfully sent SIGKILL.
 type KilledProcess struct {
 	PID  int    `json:"pid"`
 	Name string `json:"name"`
 }
 // KillTestWorkers scans /proc for running test worker processes and sends
 // SIGKILL to each one found. It returns the processes for which the signal was
 // actually delivered; Name is the last path element of the matched command.
 // Errors for individual processes (e.g. already exited, not permitted) are
 // silently ignored and those processes are left out of the result. The calling
 // process itself is never signalled. A nil slice is returned when /proc cannot
 // be read or nothing was killed.
 func KillTestWorkers() []KilledProcess {
 	entries, err := os.ReadDir("/proc")
 	if err != nil {
 		return nil
 	}
 	self := os.Getpid()
 	var killed []KilledProcess
 	for _, e := range entries {
 		if !e.IsDir() {
 			continue
 		}
 		pid, err := strconv.Atoi(e.Name())
 		if err != nil || pid == self {
 			// Not a process directory, or our own process: never SIGKILL ourselves.
 			continue
 		}
 		cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
 		if err != nil {
 			continue
 		}
 		// /proc/*/cmdline uses NUL bytes as argument separators. The command is
 		// the text before the first NUL or space, whichever comes first.
 		exe := string(cmdline)
 		if end := strings.IndexAny(exe, "\x00 "); end >= 0 {
 			exe = exe[:end]
 		}
 		exe = strings.TrimSpace(exe)
 		if exe == "" {
 			// An empty command cannot match any pattern.
 			continue
 		}
 		base := exe[strings.LastIndexByte(exe, '/')+1:]
 		for _, pat := range workerPatterns {
 			if !strings.Contains(exe, pat) {
 				continue
 			}
 			// Only report the process when the signal was actually delivered.
 			if err := syscall.Kill(pid, syscall.SIGKILL); err == nil {
 				killed = append(killed, KilledProcess{PID: pid, Name: base})
 			}
 			break
 		}
 	}
 	return killed
 }
--- a/audit/internal/platform/live_metrics.go
+++ b/audit/internal/platform/live_metrics.go
@@ -68,18 +68,20 @@ func SampleLiveMetrics() LiveMetricSample {
 // sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
 // the overall CPU utilisation percentage.
 var cpuStatPrev [2]uint64 // [total, idle]
 func sampleCPULoadPct() float64 {
-	total, idle := readCPUStat()
+	total0, idle0 := readCPUStat()
-	if total == 0 {
+	if total0 == 0 {
 		return 0
 	}
-	prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
+	time.Sleep(200 * time.Millisecond)
-	cpuStatPrev = [2]uint64{total, idle}
+	total1, idle1 := readCPUStat()
-	if prevTotal == 0 {
+	if total1 == 0 {
 		return 0
 	}
 	return cpuLoadPctBetween(total0, idle0, total1, idle1)
 }
 func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
 	dt := float64(total - prevTotal)
 	di := float64(idle - prevIdle)
 	if dt <= 0 {
--- a/audit/internal/platform/live_metrics_test.go
+++ b/audit/internal/platform/live_metrics_test.go
@@ -42,3 +42,53 @@ func TestCompactAmbientTempName(t *testing.T) {
 		t.Fatalf("got %q", got)
 	}
 }
 // TestCPULoadPctBetween checks cpuLoadPctBetween on a half-busy interval, a
 // fully busy interval, an interval with no counter progress, and an idle delta
 // larger than the total delta (expected to clamp to zero).
 func TestCPULoadPctBetween(t *testing.T) {
 	type loadCase struct {
 		name      string
 		prevTotal uint64
 		prevIdle  uint64
 		total     uint64
 		idle      uint64
 		want      float64
 	}
 	table := []loadCase{
 		{name: "busy half", prevTotal: 100, prevIdle: 40, total: 200, idle: 90, want: 50},
 		{name: "fully busy", prevTotal: 100, prevIdle: 40, total: 200, idle: 40, want: 100},
 		{name: "no progress", prevTotal: 100, prevIdle: 40, total: 100, idle: 40, want: 0},
 		{name: "idle delta larger than total clamps to zero", prevTotal: 100, prevIdle: 40, total: 200, idle: 150, want: 0},
 	}
 	for idx := range table {
 		row := table[idx]
 		got := cpuLoadPctBetween(row.prevTotal, row.prevIdle, row.total, row.idle)
 		if got == row.want {
 			continue
 		}
 		t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", row.name, got, row.want)
 	}
 }
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -95,9 +95,7 @@ func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
 	if opts.DurationSec <= 0 {
 		opts.DurationSec = 300
 	}
-	if opts.SizeMB <= 0 {
+	// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
 		opts.SizeMB = 64
 	}
 	switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
 	case "", NvidiaStressLoaderBuiltin:
 		opts.Loader = NvidiaStressLoaderBuiltin
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -26,7 +26,8 @@ type PlatformStressCycle struct {
 // PlatformStressOptions controls the thermal cycling test.
 type PlatformStressOptions struct {
-	Cycles []PlatformStressCycle
+	Cycles     []PlatformStressCycle
 	Components []string // if empty: run all; values: "cpu", "gpu"
 }
 // platformStressRow is one second of telemetry.
@@ -68,8 +69,11 @@ func (s *System) RunPlatformStress(
 		return "", fmt.Errorf("mkdir run dir: %w", err)
 	}
 	hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
 	hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
 	vendor := s.DetectGPUVendor()
-	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
+	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
 	var rows []platformStressRow
 	start := time.Now()
@@ -88,27 +92,31 @@ func (s *System) RunPlatformStress(
 		var wg sync.WaitGroup
 		// CPU stress
-		wg.Add(1)
+		if hasCPU {
-		go func() {
+			wg.Add(1)
-			defer wg.Done()
+			go func() {
-			cpuCmd, err := buildCPUStressCmd(loadCtx)
+				defer wg.Done()
-			if err != nil {
+				cpuCmd, err := buildCPUStressCmd(loadCtx)
-				logFunc("CPU stress: " + err.Error())
+				if err != nil {
-				return
+					logFunc("CPU stress: " + err.Error())
-			}
+					return
-			_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
+				}
-		}()
+				_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
 			}()
 		}
 		// GPU stress
-		wg.Add(1)
+		if hasGPU {
-		go func() {
+			wg.Add(1)
-			defer wg.Done()
+			go func() {
-			gpuCmd := buildGPUStressCmd(loadCtx, vendor)
+				defer wg.Done()
-			if gpuCmd == nil {
+				gpuCmd := buildGPUStressCmd(loadCtx, vendor)
-				return
+				if gpuCmd == nil {
-			}
+					return
-			_ = gpuCmd.Wait()
+				}
-		}()
+				_ = gpuCmd.Wait()
 			}()
 		}
 		// Monitoring goroutine for load phase
 		loadRows := collectPhase(loadCtx, cycleNum, "load", start)
@@ -439,7 +447,7 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
 	if err != nil {
 		return nil
 	}
-	cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
+	cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	_ = startLowPriorityCmd(cmd, 10)
@@ -486,6 +494,15 @@ func platformStressMemoryMB() int {
 	return mb
 }
 // containsComponent reports whether name occurs in components. The comparison
 // is exact (case-sensitive, no trimming); a nil or empty slice yields false.
 func containsComponent(components []string, name string) bool {
 	for i := 0; i < len(components); i++ {
 		if components[i] == name {
 			return true
 		}
 	}
 	return false
 }
 func packPlatformDir(dir, dest string) error {
 	f, err := os.Create(dest)
 	if err != nil {
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -12,6 +12,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
 	"syscall"
 	"sort"
 	"strconv"
 	"strings"
@@ -285,7 +286,25 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
 // gpuIndices: specific GPU indices to test (empty = all GPUs).
 // ctx cancellation kills the running job.
 func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
+	resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
 }
 // resolveDCGMGPUIndices turns an optional GPU selection into an explicit index
 // list. A non-empty gpuIndices is passed through dedupeSortedIndices.
 // Otherwise the result of listNvidiaGPUIndices is returned; its error is
 // propagated unchanged, and an empty listing is reported as an error.
 func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
 	if len(gpuIndices) != 0 {
 		return dedupeSortedIndices(gpuIndices), nil
 	}
 	detected, err := listNvidiaGPUIndices()
 	switch {
 	case err != nil:
 		return nil, err
 	case len(detected) == 0:
 		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
 	}
 	return detected, nil
 }
 func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
@@ -531,6 +550,13 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
 	}
 	c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
 	c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	c.Cancel = func() error {
 		if c.Process != nil {
 			_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
 		}
 		return nil
 	}
 	if len(env) > 0 {
 		c.Env = append(os.Environ(), env...)
 	}
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -51,6 +51,18 @@ type FanStressRow struct {
 	SysPowerW    float64 // DCMI system power reading
 }
 // cachedPowerReading is a power value together with the time it was recorded.
 // effectiveSystemPowerReading treats a Value <= 0 or a zero UpdatedAt as
 // "nothing cached".
 type cachedPowerReading struct {
 	Value     float64
 	UpdatedAt time.Time
 }
 // systemPowerCache holds the last positive reading seen by sampleSystemPower;
 // systemPowerCacheMu guards it.
 var (
 	systemPowerCacheMu sync.Mutex
 	systemPowerCache   cachedPowerReading
 )
 // systemPowerHoldTTL is how long a cached reading may stand in for a missing
 // (non-positive) current reading in effectiveSystemPowerReading.
 const systemPowerHoldTTL = 15 * time.Second
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -508,11 +520,17 @@ func sampleCPUTempViaSensors() float64 {
 // sampleSystemPower reads system power draw via DCMI.
 func sampleSystemPower() float64 {
 	now := time.Now()
 	current := 0.0
 	out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
-	if err != nil {
+	if err == nil {
-		return 0
+		current = parseDCMIPowerReading(string(out))
 	}
-	return parseDCMIPowerReading(string(out))
+	systemPowerCacheMu.Lock()
 	defer systemPowerCacheMu.Unlock()
 	value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
 	systemPowerCache = updated
 	return value
 }
 // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -535,6 +553,17 @@ func parseDCMIPowerReading(raw string) float64 {
 	return 0
 }
 // effectiveSystemPowerReading decides which power value to report for one
 // sample and returns it together with the cache state to keep afterwards.
 //
 // A positive current reading is reported as-is and replaces the cache. When
 // current is not positive, the cached value is reported instead, provided it
 // is positive, has a non-zero timestamp and is no older than
 // systemPowerHoldTTL relative to now. Otherwise 0 is reported and the cache
 // is returned unchanged.
 func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
 	switch {
 	case current > 0:
 		// Fresh sample: report it and remember it for later gaps.
 		return current, cachedPowerReading{Value: current, UpdatedAt: now}
 	case !(cache.Value > 0) || cache.UpdatedAt.IsZero():
 		// Nothing usable has been cached yet.
 		return 0, cache
 	case now.Sub(cache.UpdatedAt) > systemPowerHoldTTL:
 		// The cached sample is too old to stand in for a live one.
 		return 0, cache
 	default:
 		return cache.Value, cache
 	}
 }
 // analyzeThrottling returns true if any GPU reported an active throttle reason
 // during either load phase.
 func analyzeThrottling(rows []FanStressRow) bool {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -1,6 +1,9 @@
 package platform
-import "testing"
+import (
 	"testing"
 	"time"
 )
 func TestParseFanSpeeds(t *testing.T) {
 	raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
@@ -25,3 +28,40 @@ func TestFirstFanInputValue(t *testing.T) {
 		t.Fatalf("got=%v ok=%v", got, ok)
 	}
 }
 // TestParseDCMIPowerReading feeds a two-line DCMI sample and expects the
 // "Instantaneous power reading" figure to be the one returned.
 func TestParseDCMIPowerReading(t *testing.T) {
 	const sample = `
 Instantaneous power reading:                   512 Watts
 Minimum during sampling period:               498 Watts
 `
 	got := parseDCMIPowerReading(sample)
 	if got != 512 {
 		t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
 	}
 }
 // TestEffectiveSystemPowerReading covers the three outcomes of
 // effectiveSystemPowerReading: a recent cache bridging a missing sample, a
 // fresh sample replacing the cache, and an expired cache being ignored.
 func TestEffectiveSystemPowerReading(t *testing.T) {
 	now := time.Now()
 	recent := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
 	// Missing sample, cache still inside the hold window.
 	value, next := effectiveSystemPowerReading(recent, 0, now)
 	if value != 480 {
 		t.Fatalf("got=%v want cached 480", value)
 	}
 	if next.Value != 480 {
 		t.Fatalf("updated=%+v", next)
 	}
 	// Fresh sample wins and becomes the new cache content.
 	value, next = effectiveSystemPowerReading(recent, 530, now)
 	if value != 530 {
 		t.Fatalf("got=%v want 530", value)
 	}
 	if next.Value != 530 {
 		t.Fatalf("updated=%+v", next)
 	}
 	// Missing sample, cache one second past the hold window.
 	stale := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
 	if value, _ = effectiveSystemPowerReading(stale, 0, now); value != 0 {
 		t.Fatalf("expired cache returned %v want 0", value)
 	}
 }
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -162,6 +162,39 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
 	}
 }
 // TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset checks that a nil GPU
 // selection falls back to the indices printed by nvidia-smi and that they come
 // back in ascending order.
 //
 // The test intentionally does not call t.Parallel: it replaces the
 // package-level satExecCommand hook, and a parallel test would share that
 // mutation with every other parallel test in the package (a data race, and a
 // chance for them to observe the fake nvidia-smi).
 func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
 	oldExecCommand := satExecCommand
 	// Register the restore before swapping so the hook is put back even if the
 	// test aborts early.
 	t.Cleanup(func() { satExecCommand = oldExecCommand })
 	satExecCommand = func(name string, args ...string) *exec.Cmd {
 		if name == "nvidia-smi" {
 			// Emit the indices out of order to exercise sorting.
 			return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
 		}
 		return exec.Command(name, args...)
 	}
 	got, err := resolveDCGMGPUIndices(nil)
 	if err != nil {
 		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
 	}
 	if want := "0,1,2"; joinIndexList(got) != want {
 		t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
 	}
 }
 // TestResolveDCGMGPUIndicesKeepsExplicitSelection checks that an explicit
 // selection is returned sorted and without duplicates.
 func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
 	t.Parallel()
 	const want = "1,3"
 	indices, err := resolveDCGMGPUIndices([]int{3, 1, 3})
 	if err != nil {
 		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
 	}
 	if joined := joinIndexList(indices); joined != want {
 		t.Fatalf("gpuIndices=%q want %q", joined, want)
 	}
 }
 func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -2,6 +2,13 @@ package platform
 type System struct{}
 // LiveBootSource is the JSON-serialisable description of the live boot medium.
 // NOTE(review): the field meanings below are inferred from the names and JSON
 // tags only; confirm against the code that populates this struct.
 type LiveBootSource struct {
 	// InRAM is always emitted as "in_ram".
 	InRAM  bool   `json:"in_ram"`
 	// Kind is always emitted as "kind".
 	Kind   string `json:"kind"`
 	// Source is omitted from the JSON when empty.
 	Source string `json:"source,omitempty"`
 	// Device is omitted from the JSON when empty.
 	Device string `json:"device,omitempty"`
 }
 type InterfaceInfo struct {
 	Name  string
 	State string
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -63,6 +63,10 @@ func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
 	if !sseStart(w) {
 		return
 	}
 	streamSubscribedJob(w, r, j)
 }
 func streamSubscribedJob(w http.ResponseWriter, r *http.Request, j *jobState) {
 	existing, ch := j.subscribe()
 	for _, line := range existing {
 		sseWrite(w, "", line)
@@ -106,6 +110,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
 	scanDone := make(chan error, 1)
 	go func() {
 		defer func() {
 			if rec := recover(); rec != nil {
 				scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
 			}
 		}()
 		scanner := bufio.NewScanner(pr)
 		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
 		for scanner.Scan() {
@@ -181,13 +190,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		}
 		var body struct {
-			Duration          int    `json:"duration"`
+			Duration           int      `json:"duration"`
-			DiagLevel         int    `json:"diag_level"`
+			DiagLevel          int      `json:"diag_level"`
-			GPUIndices        []int  `json:"gpu_indices"`
+			GPUIndices         []int    `json:"gpu_indices"`
-			ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
+			ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
-			Loader            string `json:"loader"`
+			Loader             string   `json:"loader"`
-			Profile           string `json:"profile"`
+			Profile            string   `json:"profile"`
-			DisplayName       string `json:"display_name"`
+			DisplayName        string   `json:"display_name"`
 			PlatformComponents []string `json:"platform_components"`
 		}
 		if r.Body != nil {
 			if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
@@ -204,13 +214,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 			Status:    TaskPending,
 			CreatedAt: time.Now(),
 			params: taskParams{
-				Duration:          body.Duration,
+				Duration:           body.Duration,
-				DiagLevel:         body.DiagLevel,
+				DiagLevel:          body.DiagLevel,
-				GPUIndices:        body.GPUIndices,
+				GPUIndices:         body.GPUIndices,
-				ExcludeGPUIndices: body.ExcludeGPUIndices,
+				ExcludeGPUIndices:  body.ExcludeGPUIndices,
-				Loader:            body.Loader,
+				Loader:             body.Loader,
-				BurnProfile:       body.Profile,
+				BurnProfile:        body.Profile,
-				DisplayName:       body.DisplayName,
+				DisplayName:        body.DisplayName,
 				PlatformComponents: body.PlatformComponents,
 			},
 		}
 		if strings.TrimSpace(body.DisplayName) != "" {
@@ -344,8 +355,10 @@ func (h *handler) handleAPINetworkStatus(w http.ResponseWriter, r *http.Request)
 		return
 	}
 	writeJSON(w, map[string]any{
-		"interfaces":    ifaces,
+		"interfaces":     ifaces,
-		"default_route": h.opts.App.DefaultRoute(),
+		"default_route":  h.opts.App.DefaultRoute(),
 		"pending_change": h.hasPendingNetworkChange(),
 		"rollback_in":    h.pendingNetworkRollbackIn(),
 	})
 }
@@ -424,27 +437,6 @@ func (h *handler) handleAPIExportList(w http.ResponseWriter, r *http.Request) {
 	writeJSON(w, entries)
 }
 func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request) {
 	if globalQueue.hasActiveTarget("support-bundle") {
 		writeError(w, http.StatusConflict, "support bundle task is already pending or running")
 		return
 	}
 	t := &Task{
 		ID:        newJobID("support-bundle"),
 		Name:      "Support Bundle",
 		Target:    "support-bundle",
 		Status:    TaskPending,
 		CreatedAt: time.Now(),
 	}
 	globalQueue.enqueue(t)
 	writeJSON(w, map[string]string{
 		"status":  "queued",
 		"task_id": t.ID,
 		"job_id":  t.ID,
 		"url":     "/export/support.tar.gz",
 	})
 }
 func (h *handler) handleAPIExportUSBTargets(w http.ResponseWriter, _ *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -512,6 +504,26 @@ func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
 	})
 }
 // ── GPU tools ─────────────────────────────────────────────────────────────────
 // handleAPIGPUTools writes a JSON array listing the GPU tools and whether
 // each one is considered available. Availability is decided per vendor by
 // checking for a device node: /dev/nvidia0 for NVIDIA, /dev/kfd for AMD.
 func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
 	type toolEntry struct {
 		ID        string `json:"id"`
 		Available bool   `json:"available"`
 		Vendor    string `json:"vendor"` // "nvidia" | "amd"
 	}
 	// exists reports whether stat on the path succeeds.
 	exists := func(path string) bool {
 		_, err := os.Stat(path)
 		return err == nil
 	}
 	nvidiaUp := exists("/dev/nvidia0")
 	amdUp := exists("/dev/kfd")
 	tools := []toolEntry{
 		{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
 		{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
 		{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
 		{ID: "rvs", Available: amdUp, Vendor: "amd"},
 	}
 	writeJSON(w, tools)
 }
 // ── System ────────────────────────────────────────────────────────────────────
 func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
@@ -519,9 +531,9 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
-	inRAM := h.opts.App.IsLiveMediaInRAM()
+	status := h.opts.App.LiveBootSource()
 	w.Header().Set("Content-Type", "application/json")
-	_ = json.NewEncoder(w).Encode(map[string]bool{"in_ram": inRAM})
+	_ = json.NewEncoder(w).Encode(status)
 }
 func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
@@ -722,13 +734,7 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
 	h.ringMemLoad.push(sample.MemLoadPct)
 	h.ringsMu.Lock()
-	for i, fan := range sample.Fans {
+	h.pushFanRings(sample.Fans)
 		for len(h.ringFans) <= i {
 			h.ringFans = append(h.ringFans, newMetricsRing(120))
 			h.fanNames = append(h.fanNames, fan.Name)
 		}
 		h.ringFans[i].push(float64(fan.RPM))
 	}
 	for _, gpu := range sample.GPUs {
 		idx := gpu.GPUIndex
 		for len(h.gpuRings) <= idx {
@@ -747,6 +753,51 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
 	h.ringsMu.Unlock()
 }
 // pushFanRings appends one data point per known fan to the fan ring buffers.
 //
 // Fans are tracked by name: h.fanNames[i] is the name for h.ringFans[i]. A
 // name not seen before gets a new name entry and a new 120-slot ring. Every
 // existing ring then receives exactly one value per call: the RPM sampled for
 // its name, otherwise the ring's latest value carried forward, otherwise 0.
 //
 // The function takes no lock itself; feedRings calls it while holding
 // h.ringsMu.
 func (h *handler) pushFanRings(fans []platform.FanReading) {
 	// Nothing sampled and nothing tracked yet: no ring needs a point.
 	if len(fans) == 0 && len(h.ringFans) == 0 {
 		return
 	}
 	// First pass: index this sample by fan name and register unseen names.
 	fanValues := make(map[string]float64, len(fans))
 	for _, fan := range fans {
 		// Unnamed readings cannot be matched to a ring and are dropped.
 		if fan.Name == "" {
 			continue
 		}
 		fanValues[fan.Name] = fan.RPM
 		found := false
 		for i, name := range h.fanNames {
 			if name == fan.Name {
 				found = true
 				// Known name without a ring at its index: grow ringFans by one.
 				if i >= len(h.ringFans) {
 					h.ringFans = append(h.ringFans, newMetricsRing(120))
 				}
 				break
 			}
 		}
 		if !found {
 			h.fanNames = append(h.fanNames, fan.Name)
 			h.ringFans = append(h.ringFans, newMetricsRing(120))
 		}
 	}
 	// Second pass: push one value into every ring so all series advance together.
 	for i, ring := range h.ringFans {
 		if ring == nil {
 			continue
 		}
 		name := ""
 		if i < len(h.fanNames) {
 			name = h.fanNames[i]
 		}
 		if rpm, ok := fanValues[name]; ok {
 			ring.push(rpm)
 			continue
 		}
 		// Fan missing from this sample: repeat its previous value if it has one.
 		if last, ok := ring.latest(); ok {
 			ring.push(last)
 			continue
 		}
 		ring.push(0)
 	}
 }
 func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
 	if name == "" {
 		return
@@ -825,7 +876,10 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
 		return result, err
 	}
-	pnc := &pendingNetChange{snapshot: snapshot}
+	pnc := &pendingNetChange{
 		snapshot: snapshot,
 		deadline: time.Now().Add(netRollbackTimeout),
 	}
 	pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
 		_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
 		h.pendingNetMu.Lock()
@@ -842,6 +896,25 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
 	return result, nil
 }
 // hasPendingNetworkChange reports whether h.pendingNet is currently set. The
 // field is read under h.pendingNetMu.
 func (h *handler) hasPendingNetworkChange() bool {
 	h.pendingNetMu.Lock()
 	defer h.pendingNetMu.Unlock()
 	return h.pendingNet != nil
 }
 // pendingNetworkRollbackIn returns the whole seconds left until the pending
 // network change reaches its deadline. It returns 0 when no change is pending
 // and never less than 1 while one is, even if the deadline has already passed.
 func (h *handler) pendingNetworkRollbackIn() int {
 	h.pendingNetMu.Lock()
 	defer h.pendingNetMu.Unlock()
 	pending := h.pendingNet
 	if pending == nil {
 		return 0
 	}
 	if secs := int(time.Until(pending.deadline).Seconds()); secs >= 1 {
 		return secs
 	}
 	return 1
 }
 func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
 	h.pendingNetMu.Lock()
 	pnc := h.pendingNet
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -1,12 +1,12 @@
 package webui
 import (
 	"encoding/json"
 	"net/http/httptest"
 	"strings"
 	"testing"
 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 )
 func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
@@ -64,39 +64,29 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	}
 }
-func TestHandleAPIExportBundleQueuesTask(t *testing.T) {
+
-	globalQueue.mu.Lock()
+func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
-	originalTasks := globalQueue.tasks
+	h := &handler{}
-	globalQueue.tasks = nil
+	h.pushFanRings([]platform.FanReading{
-	globalQueue.mu.Unlock()
+		{Name: "FAN_A", RPM: 4200},
-	t.Cleanup(func() {
+		{Name: "FAN_B", RPM: 5100},
-		globalQueue.mu.Lock()
+	})
-		globalQueue.tasks = originalTasks
+	h.pushFanRings([]platform.FanReading{
-		globalQueue.mu.Unlock()
+		{Name: "FAN_B", RPM: 5200},
 	})
-	h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
+	if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
-	req := httptest.NewRequest("POST", "/api/export/bundle", nil)
+		t.Fatalf("fanNames=%v", h.fanNames)
 	rec := httptest.NewRecorder()
 	h.handleAPIExportBundle(rec, req)
 	if rec.Code != 200 {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
-	var body map[string]string
+	aVals, _ := h.ringFans[0].snapshot()
-	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
+	bVals, _ := h.ringFans[1].snapshot()
-		t.Fatalf("decode response: %v", err)
+	if len(aVals) != 2 || len(bVals) != 2 {
 		t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
 	}
-	if body["task_id"] == "" {
+	if aVals[1] != 4200 {
-		t.Fatalf("missing task_id in response: %v", body)
+		t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
 	}
-	globalQueue.mu.Lock()
+	if bVals[1] != 5200 {
-	defer globalQueue.mu.Unlock()
+		t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
 	if len(globalQueue.tasks) != 1 {
 		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
 	}
 	if got := globalQueue.tasks[0].Target; got != "support-bundle" {
 		t.Fatalf("target=%q want support-bundle", got)
 	}
 }
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -84,12 +84,12 @@ func (m *jobManager) create(id string) *jobState {
 	j := &jobState{}
 	m.jobs[id] = j
 	// Schedule cleanup after 30 minutes
-	go func() {
+	goRecoverOnce("job cleanup", func() {
 		time.Sleep(30 * time.Minute)
 		m.mu.Lock()
 		delete(m.jobs, id)
 		m.mu.Unlock()
-	}()
+	})
 	return j
 }
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -0,0 +1,241 @@
 package webui
 import (
 	"bufio"
 	"io"
 	"log/slog"
 	"os"
 	"strings"
 	"sync"
 	"time"
 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 )
 // kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
 // It supports multiple concurrent SAT tasks: a shared event window is open
 // while any SAT task is running, and flushed when all tasks complete.
 type kmsgWatcher struct {
 	// mu guards activeCount and window.
 	mu          sync.Mutex
 	activeCount int // number of in-flight SAT tasks
 	// window is the open event window; nil while no SAT task is running.
 	window      *kmsgWindow
 	// statusDB receives the flushed events; flushWindow does nothing when it is nil.
 	statusDB    *app.ComponentStatusDB
 }
 // kmsgWindow is the set of events collected between the start of the first
 // SAT task and the end of the last one.
 type kmsgWindow struct {
 	targets   []string // SAT targets running concurrently
 	// startedAt is the time the window was opened.
 	startedAt time.Time
 	// seen marks the (id, category) pairs already recorded in this window.
 	seen      map[kmsgEventKey]bool
 	// events holds the recorded events in arrival order.
 	events    []kmsgEvent
 }
 // kmsgEventKey identifies an event for de-duplication within one window.
 // Events that carry no identifier use an empty id.
 type kmsgEventKey struct {
 	id       string // BDF or device name
 	// category is the category of the pattern that matched the message.
 	category string
 }
 // kmsgEvent is one kernel log message that matched a hardware error pattern.
 type kmsgEvent struct {
 	// timestamp is the wall-clock time at which the line was parsed, not the
 	// kernel's own timestamp.
 	timestamp time.Time
 	// raw is the message text that was matched against the patterns.
 	raw       string
 	ids       []string // BDF addresses or device names extracted
 	// category is copied from the pattern that matched.
 	category  string
 }
 // newKmsgWatcher returns a watcher that will flush events to statusDB. No
 // goroutine is started here; call start for that. statusDB may be nil, in
 // which case flushed events are discarded.
 func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
 	return &kmsgWatcher{statusDB: statusDB}
 }
 // start launches the background kmsg reading goroutine.
 // NOTE(review): presumably goRecoverLoop re-runs w.run after a panic, waiting
 // 5 seconds in between — confirm against goRecoverLoop.
 func (w *kmsgWatcher) start() {
 	goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
 }
 // run reads /dev/kmsg line by line and records every line that
 // parseKmsgLine accepts, but only while an event window is open. It never
 // returns: when the device cannot be opened it retries after 30 seconds, and
 // when reading stops it closes the file and reopens it after 2 seconds.
 func (w *kmsgWatcher) run() {
 	for {
 		f, err := os.Open("/dev/kmsg")
 		if err != nil {
 			slog.Warn("kmsg watcher unavailable", "err", err)
 			time.Sleep(30 * time.Second)
 			continue
 		}
 		// Best-effort seek to end so we only capture events from now forward.
 		_, _ = f.Seek(0, io.SeekEnd)
 		scanner := bufio.NewScanner(f)
 		// Cap a single line at 64 KiB; a longer line ends the scan with an error.
 		scanner.Buffer(make([]byte, 64*1024), 64*1024)
 		for scanner.Scan() {
 			line := scanner.Text()
 			evt, ok := parseKmsgLine(line)
 			if !ok {
 				continue
 			}
 			// Matching lines seen while no SAT task is running are dropped.
 			w.mu.Lock()
 			if w.window != nil {
 				w.recordEvent(evt)
 			}
 			w.mu.Unlock()
 		}
 		if err := scanner.Err(); err != nil {
 			slog.Warn("kmsg watcher stopped", "err", err)
 		}
 		_ = f.Close()
 		// Short pause before reopening so a persistent failure does not spin.
 		time.Sleep(2 * time.Second)
 	}
 }
 // recordEvent appends evt to the active window, deduplicating by (id, category).
 // An event without ids is keyed by the empty id. The event is stored at most
 // once, even when it introduces several previously unseen ids, so
 // window.events never holds the same event twice.
 // Must be called with w.mu held. It is a no-op when no window is open.
 func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
 	if w.window == nil {
 		return
 	}
 	ids := evt.ids
 	if len(ids) == 0 {
 		ids = []string{""}
 	}
 	fresh := false
 	for _, id := range ids {
 		key := kmsgEventKey{id: id, category: evt.category}
 		if w.window.seen[key] {
 			continue
 		}
 		// Mark every id, not just the first new one, so later events that
 		// repeat any of them are recognised as duplicates.
 		w.window.seen[key] = true
 		fresh = true
 	}
 	if fresh {
 		w.window.events = append(w.window.events, evt)
 	}
 }
 // NotifyTaskStarted registers one more running SAT task. The first task to
 // start opens a fresh shared event window; every task adds its target to the
 // window that is open. taskID is currently unused.
 func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
 	w.mu.Lock()
 	defer w.mu.Unlock()
 	opensWindow := w.activeCount == 0
 	w.activeCount++
 	if opensWindow {
 		w.window = &kmsgWindow{
 			startedAt: time.Now(),
 			seen:      map[kmsgEventKey]bool{},
 		}
 	}
 	if win := w.window; win != nil {
 		win.targets = append(win.targets, target)
 	}
 }
 // NotifyTaskFinished marks one SAT task as done. When the last running task
 // finishes, the shared window is detached from the watcher and, if it holds
 // any events, flushed to the status DB via goRecoverOnce. The counter is
 // clamped at zero. taskID is currently unused.
 func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
 	// Detach the window under the lock; the flush itself runs without it.
 	closed := func() *kmsgWindow {
 		w.mu.Lock()
 		defer w.mu.Unlock()
 		w.activeCount--
 		if w.activeCount > 0 {
 			return nil
 		}
 		w.activeCount = 0
 		detached := w.window
 		w.window = nil
 		return detached
 	}()
 	if closed != nil && len(closed.events) > 0 {
 		goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(closed) })
 	}
 }
 // flushWindow writes one "Warning" record per affected component to the
 // status DB. It does nothing when no status DB is configured.
 //
 // Component keys are derived per event: events without ids map to
 // "memory:all" for the "memory" category and to "cpu:all" otherwise; events
 // with ids map to "storage:<id>" for the "storage" category and to
 // "pcie:<normalized BDF>" for every other category. Only the first raw
 // message seen for a key is kept, and it is cut to 120 bytes in the detail.
 func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
 	if w.statusDB == nil {
 		return
 	}
 	source := "watchdog:kmsg"
 	// firstRaw maps component key → first raw line seen for it.
 	firstRaw := map[string]string{}
 	remember := func(key, raw string) {
 		if _, known := firstRaw[key]; !known {
 			firstRaw[key] = raw
 		}
 	}
 	for _, evt := range window.events {
 		if len(evt.ids) == 0 {
 			// MCE or un-identified error.
 			if evt.category == "memory" {
 				remember("memory:all", evt.raw)
 			} else {
 				remember("cpu:all", evt.raw)
 			}
 			continue
 		}
 		for _, id := range evt.ids {
 			if evt.category == "storage" {
 				remember("storage:"+id, evt.raw)
 			} else {
 				remember("pcie:"+normalizeBDF(id), evt.raw)
 			}
 		}
 	}
 	prefix := "kernel error during SAT (" + strings.Join(window.targets, ",") + "): "
 	for key, raw := range firstRaw {
 		w.statusDB.Record(key, source, "Warning", prefix+truncate(raw, 120))
 	}
 }
 // parseKmsgLine turns one /dev/kmsg record into an event when its message
 // matches an entry of platform.HardwareErrorPatterns; the first matching
 // pattern wins. The second return value is false for empty or non-matching
 // messages.
 // kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
 func parseKmsgLine(raw string) (kmsgEvent, bool) {
 	// Everything after the first ';' is the message; a line without ';' is
 	// used whole and untrimmed.
 	text := raw
 	if _, body, hasHeader := strings.Cut(raw, ";"); hasHeader {
 		text = strings.TrimSpace(body)
 	}
 	if text == "" {
 		return kmsgEvent{}, false
 	}
 	for _, pattern := range platform.HardwareErrorPatterns {
 		groups := pattern.Re.FindStringSubmatch(text)
 		if groups == nil {
 			continue
 		}
 		// A group index of 0 (or one beyond the match) means "not captured".
 		var ids []string
 		if g := pattern.BDFGroup; g > 0 && g < len(groups) {
 			ids = append(ids, normalizeBDF(groups[g]))
 		}
 		if g := pattern.DevGroup; g > 0 && g < len(groups) {
 			ids = append(ids, groups[g])
 		}
 		return kmsgEvent{
 			timestamp: time.Now(),
 			raw:       text,
 			ids:       ids,
 			category:  pattern.Category,
 		}, true
 	}
 	return kmsgEvent{}, false
 }
 // normalizeBDF returns bdf trimmed and lower-cased, in the 4-part form
 // "0000:c8:00.0". Input containing exactly one ':' (bus:device.function
 // without a domain) is prefixed with domain "0000:"; anything else is
 // returned as-is after trimming and lower-casing.
 func normalizeBDF(bdf string) string {
 	canonical := strings.ToLower(strings.TrimSpace(bdf))
 	if strings.Count(canonical, ":") != 1 {
 		return canonical
 	}
 	return "0000:" + canonical
 }
 // truncate returns s unchanged when it is at most max bytes long. Otherwise
 // it returns a prefix of s of at most max bytes followed by "...".
 //
 // The cut is moved back to the start of a UTF-8 sequence when max falls
 // inside a multi-byte character, so the result never ends in a partial rune.
 // A negative max is treated as 0 instead of panicking.
 func truncate(s string, max int) string {
 	if len(s) <= max {
 		return s
 	}
 	if max < 0 {
 		max = 0
 	}
 	// Ranging over a string yields the byte offset at which each rune starts;
 	// keep the largest such offset that does not exceed max.
 	cut := 0
 	for i := range s {
 		if i > max {
 			break
 		}
 		cut = i
 	}
 	return s[:cut] + "..."
 }
 // isSATTarget reports whether target names a hardware acceptance test task.
 // The match is exact and case-sensitive.
 func isSATTarget(target string) bool {
 	switch target {
 	case "nvidia", "nvidia-stress":
 		return true
 	case "amd", "amd-mem", "amd-bandwidth", "amd-stress":
 		return true
 	case "memory", "memory-stress", "storage", "cpu":
 		return true
 	case "sat-stress", "platform-stress":
 		return true
 	default:
 		return false
 	}
 }
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -6,7 +6,9 @@ import (
 	"io"
 	"os"
 	"path/filepath"
 	"sort"
 	"strconv"
 	"strings"
 	"time"
 	"bee/audit/internal/platform"
@@ -53,6 +55,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
  usage_pct     REAL,
  mem_usage_pct REAL,
  power_w       REAL,
  clock_mhz     REAL,
  mem_clock_mhz REAL,
  PRIMARY KEY (ts, gpu_index)
 );
 CREATE TABLE IF NOT EXISTS fan_metrics (
@@ -69,6 +73,38 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
  PRIMARY KEY (ts, name)
 );
 `)
 	if err != nil {
 		return err
 	}
 	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
 		return err
 	}
 	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
 }
 // ensureMetricsColumn adds column (declared as definition) to table unless a
 // column of that name, compared case-insensitively, already exists.
 // table, column and definition are spliced into the SQL text verbatim, so
 // callers must pass trusted identifiers only.
 func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
 	present, err := metricsColumnExists(db, table, column)
 	if err != nil || present {
 		return err
 	}
 	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
 	return err
 }
 // metricsColumnExists reports whether PRAGMA table_info lists column for table.
 func metricsColumnExists(db *sql.DB, table, column string) (bool, error) {
 	rows, err := db.Query("PRAGMA table_info(" + table + ")")
 	if err != nil {
 		return false, err
 	}
 	defer rows.Close()
 	// One PRAGMA table_info row: cid, name, type, notnull, dflt_value, pk.
 	var info struct {
 		cid, notNull, pk int
 		name, ctype      string
 		dflt             sql.NullString
 	}
 	for rows.Next() {
 		if err := rows.Scan(&info.cid, &info.name, &info.ctype, &info.notNull, &info.dflt, &info.pk); err != nil {
 			return false, err
 		}
 		if strings.EqualFold(info.name, column) {
 			return true, nil
 		}
 	}
 	return false, rows.Err()
 }
@@ -90,8 +126,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	}
 	for _, g := range s.GPUs {
 		_, err = tx.Exec(
-			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
+			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
-			ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
+			ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
 		)
 		if err != nil {
 			return err
@@ -120,7 +156,7 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 // LoadRecent returns up to n samples in chronological order (oldest first).
 func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
 }
 // LoadAll returns all persisted samples in chronological order (oldest first).
@@ -151,11 +187,6 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	if len(sysRows) == 0 {
 		return nil, nil
 	}
 	// Reverse to chronological order
 	for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
 		sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
 	}
 	// Collect min/max ts for range query
 	minTS := sysRows[0].ts
 	maxTS := sysRows[len(sysRows)-1].ts
@@ -167,7 +198,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	}
 	gpuData := map[gpuKey]platform.GPUMetricRow{}
 	gRows, err := m.db.Query(
-		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
+		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
 		minTS, maxTS,
 	)
 	if err == nil {
@@ -175,7 +206,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 		for gRows.Next() {
 			var ts int64
 			var g platform.GPUMetricRow
-			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
+			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
 				gpuData[gpuKey{ts, g.GPUIndex}] = g
 			}
 		}
@@ -222,7 +253,9 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 		}
 	}
-	// Collect unique GPU indices and fan names from loaded data (preserve order)
+	// Collect unique GPU indices and fan/temp names from loaded data.
 	// Sort each list so that sample reconstruction is deterministic regardless
 	// of Go's non-deterministic map iteration order.
 	seenGPU := map[int]bool{}
 	var gpuIndices []int
 	for k := range gpuData {
@@ -231,6 +264,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 			gpuIndices = append(gpuIndices, k.idx)
 		}
 	}
 	sort.Ints(gpuIndices)
 	seenFan := map[string]bool{}
 	var fanNames []string
 	for k := range fanData {
@@ -239,6 +274,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 			fanNames = append(fanNames, k.name)
 		}
 	}
 	sort.Strings(fanNames)
 	seenTemp := map[string]bool{}
 	var tempNames []string
 	for k := range tempData {
@@ -247,6 +284,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 			tempNames = append(tempNames, k.name)
 		}
 	}
 	sort.Strings(tempNames)
 	samples := make([]platform.LiveMetricSample, len(sysRows))
 	for i, r := range sysRows {
@@ -280,7 +318,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	rows, err := m.db.Query(`
 		SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
-		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
+		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
 		       g.clock_mhz, g.mem_clock_mhz
 		FROM sys_metrics s
 		LEFT JOIN gpu_metrics g ON g.ts = s.ts
 		ORDER BY s.ts, g.gpu_index
@@ -291,13 +330,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	defer rows.Close()
 	cw := csv.NewWriter(w)
-	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
+	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
 	for rows.Next() {
 		var ts int64
 		var cpu, mem, pwr float64
 		var gpuIdx sql.NullInt64
-		var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
+		var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
-		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
+		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
 			continue
 		}
 		row := []string{
@@ -313,9 +352,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 				strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
 			)
 		} else {
-			row = append(row, "", "", "", "", "")
+			row = append(row, "", "", "", "", "", "", "")
 		}
 		_ = cw.Write(row)
 	}
--- a/audit/internal/webui/metricsdb_test.go
+++ b/audit/internal/webui/metricsdb_test.go
@@ -0,0 +1,145 @@
 package webui
 import (
 	"database/sql"
 	"path/filepath"
 	"testing"
 	"time"
 	"bee/audit/internal/platform"
 	_ "modernc.org/sqlite"
 )
 // TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs writes three samples
 // one second apart, each with GPU rows for indices 0 and 2, and checks that
 // LoadAll returns all three with both GPU rows in index order and that
 // LoadRecent(2) returns two samples oldest-first with their GPU rows attached.
 func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
 	db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
 	if err != nil {
 		t.Fatalf("openMetricsDB: %v", err)
 	}
 	defer db.Close()
 	base := time.Unix(1_700_000_000, 0).UTC()
 	for i := 0; i < 3; i++ {
 		// Every value is offset by i so each sample is distinguishable.
 		err := db.Write(platform.LiveMetricSample{
 			Timestamp:  base.Add(time.Duration(i) * time.Second),
 			CPULoadPct: float64(10 + i),
 			MemLoadPct: float64(20 + i),
 			PowerW:     float64(300 + i),
 			GPUs: []platform.GPUMetricRow{
 				{GPUIndex: 0, PowerW: float64(100 + i)},
 				{GPUIndex: 2, PowerW: float64(200 + i)},
 			},
 		})
 		if err != nil {
 			t.Fatalf("Write(%d): %v", i, err)
 		}
 	}
 	all, err := db.LoadAll()
 	if err != nil {
 		t.Fatalf("LoadAll: %v", err)
 	}
 	if len(all) != 3 {
 		t.Fatalf("LoadAll len=%d want 3", len(all))
 	}
 	for i, sample := range all {
 		if len(sample.GPUs) != 2 {
 			t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
 		}
 		if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
 			t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
 		}
 		if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
 			t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
 		}
 	}
 	recent, err := db.LoadRecent(2)
 	if err != nil {
 		t.Fatalf("LoadRecent: %v", err)
 	}
 	if len(recent) != 2 {
 		t.Fatalf("LoadRecent len=%d want 2", len(recent))
 	}
 	// LoadRecent must hand the newest rows back in ascending time order.
 	if !recent[0].Timestamp.Before(recent[1].Timestamp) {
 		t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
 	}
 	for i, sample := range recent {
 		if len(sample.GPUs) != 2 {
 			t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
 		}
 	}
 }
 // TestMetricsDBMigratesLegacyGPUSchema creates a database whose gpu_metrics
 // table lacks the clock_mhz and mem_clock_mhz columns, reopens it through
 // openMetricsDB, and checks that a sample carrying both clock values can then
 // be written and read back.
 func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
 	path := filepath.Join(t.TempDir(), "metrics.db")
 	raw, err := sql.Open("sqlite", path)
 	if err != nil {
 		t.Fatalf("sql.Open: %v", err)
 	}
 	// Legacy schema: gpu_metrics without the two clock columns.
 	_, err = raw.Exec(`
 CREATE TABLE gpu_metrics (
  ts            INTEGER NOT NULL,
  gpu_index     INTEGER NOT NULL,
  temp_c        REAL,
  usage_pct     REAL,
  mem_usage_pct REAL,
  power_w       REAL,
  PRIMARY KEY (ts, gpu_index)
 );
 CREATE TABLE sys_metrics (
  ts           INTEGER NOT NULL,
  cpu_load_pct REAL,
  mem_load_pct REAL,
  power_w      REAL,
  PRIMARY KEY (ts)
 );
 CREATE TABLE fan_metrics (
  ts   INTEGER NOT NULL,
  name TEXT NOT NULL,
  rpm  REAL,
  PRIMARY KEY (ts, name)
 );
 CREATE TABLE temp_metrics (
  ts      INTEGER NOT NULL,
  name    TEXT NOT NULL,
  grp     TEXT NOT NULL,
  celsius REAL,
  PRIMARY KEY (ts, name)
 );
 `)
 	if err != nil {
 		t.Fatalf("create legacy schema: %v", err)
 	}
 	_ = raw.Close()
 	// Reopening through openMetricsDB is the step expected to add the columns.
 	db, err := openMetricsDB(path)
 	if err != nil {
 		t.Fatalf("openMetricsDB: %v", err)
 	}
 	defer db.Close()
 	now := time.Unix(1_700_000_100, 0).UTC()
 	err = db.Write(platform.LiveMetricSample{
 		Timestamp: now,
 		GPUs: []platform.GPUMetricRow{
 			{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
 		},
 	})
 	if err != nil {
 		t.Fatalf("Write: %v", err)
 	}
 	samples, err := db.LoadAll()
 	if err != nil {
 		t.Fatalf("LoadAll: %v", err)
 	}
 	if len(samples) != 1 || len(samples[0].GPUs) != 1 {
 		t.Fatalf("samples=%+v", samples)
 	}
 	if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
 		t.Fatalf("ClockMHz=%v want 1410", got)
 	}
 	if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
 		t.Fatalf("MemClockMHz=%v want 2600", got)
 	}
 }
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -34,6 +34,49 @@ func TestChartLegendNumber(t *testing.T) {
 	}
 }
 // TestRecoverMiddlewareReturns500OnPanic checks that a panicking handler
 // wrapped in recoverMiddleware produces a 500 response with a generic body.
 func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
 	panicking := func(http.ResponseWriter, *http.Request) {
 		panic("boom")
 	}
 	wrapped := recoverMiddleware(http.HandlerFunc(panicking))
 	rec := httptest.NewRecorder()
 	wrapped.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/panic", nil))
 	if want := http.StatusInternalServerError; rec.Code != want {
 		t.Fatalf("status=%d want %d", rec.Code, want)
 	}
 	if body := rec.Body.String(); !strings.Contains(body, "internal server error") {
 		t.Fatalf("body=%q", body)
 	}
 }
 func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
 	handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		if !sseStart(w) {
 			return
 		}
 		if !sseWrite(w, "tick", "ok") {
 			t.Fatal("expected sse write to succeed")
 		}
 	}))
 	rec := httptest.NewRecorder()
 	req := httptest.NewRequest(http.MethodGet, "/stream", nil)
 	handler.ServeHTTP(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
 		t.Fatalf("content-type=%q", got)
 	}
 	body := rec.Body.String()
 	if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
 		t.Fatalf("body=%q", body)
 	}
 }
 func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 	samples := []platform.LiveMetricSample{
 		{
@@ -89,6 +132,242 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 	}
 }
 // TestChartDataFromSamplesKeepsStableGPUSeriesOrder feeds samples whose GPU
 // rows arrive in a different order each time and expects the "gpu-all-power"
 // chart to list series by ascending GPU index with matching datasets.
 func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
 	samples := []platform.LiveMetricSample{
 		{
 			Timestamp: time.Now().Add(-2 * time.Minute),
 			GPUs: []platform.GPUMetricRow{
 				{GPUIndex: 7, PowerW: 170},
 				{GPUIndex: 2, PowerW: 120},
 				{GPUIndex: 0, PowerW: 100},
 			},
 		},
 		{
 			Timestamp: time.Now().Add(-1 * time.Minute),
 			GPUs: []platform.GPUMetricRow{
 				{GPUIndex: 0, PowerW: 101},
 				{GPUIndex: 7, PowerW: 171},
 				{GPUIndex: 2, PowerW: 121},
 			},
 		},
 	}
 	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
 	if title != "GPU Power" {
 		t.Fatalf("title=%q", title)
 	}
 	wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
 	if len(names) != len(wantNames) {
 		t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
 	}
 	for i := range wantNames {
 		if names[i] != wantNames[i] {
 			t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
 		}
 	}
 	// Guard the indexed accesses below so a regression reports a clean failure
 	// instead of an index-out-of-range panic.
 	if len(datasets) != len(wantNames) {
 		t.Fatalf("datasets len=%d want %d", len(datasets), len(wantNames))
 	}
 	if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
 		t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
 	}
 	if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
 		t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
 	}
 	if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
 		t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
 	}
 }
 // TestChartDataFromSamplesIncludesGPUClockCharts checks that the
 // "gpu-all-clock" and "gpu-all-memclock" charts are produced from GPU samples
 // with the expected titles, series names and data points.
 func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
 	samples := []platform.LiveMetricSample{
 		{
 			Timestamp: time.Now().Add(-2 * time.Minute),
 			GPUs: []platform.GPUMetricRow{
 				{GPUIndex: 0, ClockMHz: 1400, MemClockMHz: 2600},
 				{GPUIndex: 3, ClockMHz: 1500, MemClockMHz: 2800},
 			},
 		},
 		{
 			Timestamp: time.Now().Add(-1 * time.Minute),
 			GPUs: []platform.GPUMetricRow{
 				{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2610},
 				{GPUIndex: 3, ClockMHz: 1510, MemClockMHz: 2810},
 			},
 		},
 	}
 	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
 	if !ok {
 		t.Fatal("gpu-all-clock returned ok=false")
 	}
 	if title != "GPU Core Clock" {
 		t.Fatalf("title=%q", title)
 	}
 	if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
 		t.Fatalf("names=%v", names)
 	}
 	// Bounds guard: fail cleanly rather than panic on a short result.
 	if len(datasets) < 2 || len(datasets[1]) < 2 {
 		t.Fatalf("gpu-all-clock datasets=%v too short", datasets)
 	}
 	if got := datasets[1][1]; got != 1510 {
 		t.Fatalf("GPU 3 core clock=%v want 1510", got)
 	}
 	datasets, names, _, title, _, _, ok = chartDataFromSamples("gpu-all-memclock", samples)
 	if !ok {
 		t.Fatal("gpu-all-memclock returned ok=false")
 	}
 	if title != "GPU Memory Clock" {
 		t.Fatalf("title=%q", title)
 	}
 	if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
 		t.Fatalf("names=%v", names)
 	}
 	// Bounds guard: fail cleanly rather than panic on a short result.
 	if len(datasets) < 1 || len(datasets[0]) < 1 {
 		t.Fatalf("gpu-all-memclock datasets=%v too short", datasets)
 	}
 	if got := datasets[0][0]; got != 2600 {
 		t.Fatalf("GPU 0 memory clock=%v want 2600", got)
 	}
 }
 // TestNormalizePowerSeriesHoldsLastPositive expects zero readings that follow
 // a positive reading to be replaced by that last positive value, while a
 // leading zero stays zero.
 func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
 	expected := []float64{0, 480, 480, 480, 510, 510}
 	actual := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
 	if len(actual) != len(expected) {
 		t.Fatalf("len=%d want %d", len(actual), len(expected))
 	}
 	for idx, wantVal := range expected {
 		if gotVal := actual[idx]; gotVal != wantVal {
 			t.Fatalf("got[%d]=%v want %v", idx, gotVal, wantVal)
 		}
 	}
 }
 // TestRenderMetricsUsesBufferedChartRefresh asserts that the rendered metrics
 // page contains the markers for buffered chart refresh and the GPU chart UI.
 func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
 	body := renderMetrics()
 	// Checked in order; the first missing marker fails the test.
 	expectations := []struct {
 		marker string
 		reason string
 	}{
 		{"const probe = new Image();", "metrics page should preload chart images before swap"},
 		{"el.dataset.loading === '1'", "metrics page should avoid overlapping chart reloads"},
 		{`id="gpu-metrics-section" style="display:none`, "metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected"},
 		{`id="gpu-chart-toggle"`, "metrics page should render GPU chart mode toggle"},
 		{`/api/metrics/chart/gpu-all-clock.svg`, "metrics page should include GPU core clock chart"},
 		{`/api/metrics/chart/gpu-all-memclock.svg`, "metrics page should include GPU memory clock chart"},
 		{`renderGPUOverviewCards(indices)`, "metrics page should build per-GPU chart cards dynamically"},
 	}
 	for _, exp := range expectations {
 		if !strings.Contains(body, exp.marker) {
 			t.Fatalf("%s: %s", exp.reason, body)
 		}
 	}
 }
 // TestChartLegendVisible pins the legend cut-off: shown for 8 series, hidden
 // for 9.
 func TestChartLegendVisible(t *testing.T) {
 	if shown := chartLegendVisible(8); !shown {
 		t.Fatal("legend should stay visible for charts with up to 8 series")
 	}
 	if shown := chartLegendVisible(9); shown {
 		t.Fatal("legend should be hidden for charts with more than 8 series")
 	}
 }
 // TestChartYAxisNumber checks the compact Y-axis label format: values below
 // 1000 print as-is; larger magnitudes use a "к" suffix and a decimal comma.
 func TestChartYAxisNumber(t *testing.T) {
 	cases := []struct {
 		in   float64
 		want string
 	}{
 		{999, "999"},
 		{1000, "1к"},
 		{1370, "1,4к"},
 		{1500, "1,5к"},
 		{1700, "1,7к"},
 		{2000, "2к"},
 		{9999, "10к"},
 		{10200, "10к"},
 		{-1500, "-1,5к"},
 	}
 	for i := range cases {
 		value, expected := cases[i].in, cases[i].want
 		if label := chartYAxisNumber(value); label != expected {
 			t.Fatalf("chartYAxisNumber(%v)=%q want %q", value, label, expected)
 		}
 	}
 }
 // TestChartCanvasHeight pins the canvas height returned for 4 and 12 series.
 func TestChartCanvasHeight(t *testing.T) {
 	const (
 		fewSeries  = 4
 		manySeries = 12
 	)
 	if height := chartCanvasHeight(fewSeries); height != 360 {
 		t.Fatalf("chartCanvasHeight(4)=%d want 360", height)
 	}
 	if height := chartCanvasHeight(manySeries); height != 288 {
 		t.Fatalf("chartCanvasHeight(12)=%d want 288", height)
 	}
 }
 // TestNormalizeFanSeriesHoldsLastPositive expects zero RPM readings to be
 // replaced by the most recent positive reading.
 func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	expected := []float64{4200, 4200, 4200, 4300, 4300}
 	actual := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
 	if len(actual) != len(expected) {
 		t.Fatalf("len=%d want %d", len(actual), len(expected))
 	}
 	for idx, wantVal := range expected {
 		if gotVal := actual[idx]; gotVal != wantVal {
 			t.Fatalf("got[%d]=%v want %v", idx, gotVal, wantVal)
 		}
 	}
 }
 // TestChartYAxisOption checks that chartYAxisOption passes the given bounds
 // through by pointer, uses 11 labels, and formats values in compact form.
 func TestChartYAxisOption(t *testing.T) {
 	lower, upper := floatPtr(0), floatPtr(100)
 	axis := chartYAxisOption(lower, upper)
 	// Pointer identity, not value equality, is what is being compared here.
 	if !(axis.Min == lower && axis.Max == upper) {
 		t.Fatalf("chartYAxisOption min/max mismatch: %#v", axis)
 	}
 	if axis.LabelCount != 11 {
 		t.Fatalf("chartYAxisOption labelCount=%d want 11", axis.LabelCount)
 	}
 	if label := axis.ValueFormatter(1000); label != "1к" {
 		t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", label)
 	}
 }
 // TestSnapshotFanRingsUsesTimelineLabels pushes two values into each of two
 // rings and expects snapshotFanRings to return two datasets, "<name> RPM"
 // series names, and one non-empty label per pushed point.
 func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
 	ringA := newMetricsRing(4)
 	ringB := newMetricsRing(4)
 	ringA.push(1000)
 	ringA.push(1100)
 	ringB.push(1200)
 	ringB.push(1300)
 	rings := []*metricsRing{ringA, ringB}
 	datasets, names, labels := snapshotFanRings(rings, []string{"FAN_A", "FAN_B"})
 	if n := len(datasets); n != 2 {
 		t.Fatalf("datasets=%d want 2", n)
 	}
 	namesOK := len(names) == 2 && names[0] == "FAN_A RPM" && names[1] == "FAN_B RPM"
 	if !namesOK {
 		t.Fatalf("names=%v", names)
 	}
 	if len(labels) != 2 {
 		t.Fatalf("labels=%v want 2 entries", labels)
 	}
 	for _, label := range labels {
 		if label == "" {
 			t.Fatalf("labels should contain timeline values, got %v", labels)
 		}
 	}
 }
 // TestRenderNetworkInlineSyncsPendingState asserts that the inline network UI
 // script reads the pending state, polls periodically, and shows the pending
 // confirmation on apply.
 func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
 	body := renderNetworkInline()
 	expectations := []struct {
 		marker string
 		reason string
 	}{
 		{"d.pending_change", "network UI should read pending network state from API"},
 		{"setInterval(loadNetwork, 5000)", "network UI should periodically refresh network state"},
 		{"showNetPending(NET_ROLLBACK_SECS)", "network UI should show pending confirmation immediately on apply"},
 	}
 	for _, exp := range expectations {
 		if !strings.Contains(body, exp.marker) {
 			t.Fatalf("%s: %s", exp.reason, body)
 		}
 	}
 }
 func TestRootRendersDashboard(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
@@ -101,9 +380,10 @@ func TestRootRendersDashboard(t *testing.T) {
 	}
 	handler := NewHandler(HandlerOptions{
-		Title:     "Bee Hardware Audit",
+		Title:      "Bee Hardware Audit",
-		AuditPath: path,
+		BuildLabel: "1.2.3",
-		ExportDir: exportDir,
+		AuditPath:  path,
 		ExportDir:  exportDir,
 	})
 	first := httptest.NewRecorder()
@@ -118,6 +398,11 @@ func TestRootRendersDashboard(t *testing.T) {
 	if !strings.Contains(first.Body.String(), `/viewer`) {
 		t.Fatalf("first body missing viewer link: %s", first.Body.String())
 	}
 	versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
 	navIdx := strings.Index(first.Body.String(), `href="/"`)
 	if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
 		t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
 	}
 	if got := first.Header().Get("Cache-Control"); got != "no-store" {
 		t.Fatalf("first cache-control=%q", got)
 	}
@@ -185,6 +470,84 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
 	}
 }
 // TestTasksPageRendersLogModalAndPaginationControls requests /tasks from a
 // default handler and checks for the log modal overlay, the page-size
 // setting, and the Previous/Next buttons.
 func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	req := httptest.NewRequest(http.MethodGet, "/tasks", nil)
 	handler.ServeHTTP(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	expectations := []struct {
 		markers []string
 		reason  string
 	}{
 		{[]string{`id="task-log-overlay"`}, "tasks page missing log modal overlay"},
 		{[]string{`_taskPageSize = 50`}, "tasks page missing pagination size config"},
 		{[]string{`Previous</button>`, `Next</button>`}, "tasks page missing pagination controls"},
 	}
 	for _, exp := range expectations {
 		for _, marker := range exp.markers {
 			if !strings.Contains(body, marker) {
 				t.Fatalf("%s: %s", exp.reason, body)
 			}
 		}
 	}
 }
 // TestToolsPageRendersRestartGPUDriversButton requests /tools from a default
 // handler and checks for the GPU driver restart control, the boot source
 // field, and the USB export controls.
 func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	req := httptest.NewRequest(http.MethodGet, "/tools", nil)
 	handler.ServeHTTP(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	expectations := []struct {
 		marker string
 		reason string
 	}{
 		{`Restart GPU Drivers`, "tools page missing restart gpu drivers button"},
 		{`svcAction('bee-nvidia', 'restart')`, "tools page missing bee-nvidia restart action"},
 		{`id="boot-source-text"`, "tools page missing boot source field"},
 		{`Export to USB`, "tools page missing export to usb section"},
 		{`Support Bundle</button>`, "tools page missing support bundle usb button"},
 	}
 	for _, exp := range expectations {
 		if !strings.Contains(body, exp.marker) {
 			t.Fatalf("%s: %s", exp.reason, body)
 		}
 	}
 }
 func TestTasksPageRendersScrollableLogModal(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
 	exportDir := filepath.Join(dir, "export")
 	if err := os.MkdirAll(exportDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
 		t.Fatal(err)
 	}
 	handler := NewHandler(HandlerOptions{
 		Title:     "Bee Hardware Audit",
 		AuditPath: path,
 		ExportDir: exportDir,
 	})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	if !strings.Contains(body, `height:calc(100vh - 32px)`) {
 		t.Fatalf("tasks page missing bounded log modal height: %s", body)
 	}
 	if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
 		t.Fatalf("tasks page missing log modal overflow guard: %s", body)
 	}
 	if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
 		t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
 	}
 }
 func TestViewerRendersLatestSnapshot(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -0,0 +1,42 @@
 package webui
 import (
 	"fmt"
 	"log/slog"
 	"runtime/debug"
 	"time"
 )
 // goRecoverLoop runs fn on a new goroutine and re-runs it each time it
 // panics, sleeping restartDelay between attempts when the delay is positive.
 // The goroutine exits once fn returns without panicking.
 func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
 	supervise := func() {
 		// runRecoverable reports true only when fn panicked.
 		for runRecoverable(name, fn) {
 			if restartDelay > 0 {
 				time.Sleep(restartDelay)
 			}
 		}
 	}
 	go supervise()
 }
 // goRecoverOnce runs fn once on a new goroutine through runRecoverable; the
 // reported panic status is discarded.
 func goRecoverOnce(name string, fn func()) {
 	go runRecoverable(name, fn)
 }
 // runRecoverable calls fn and reports whether it panicked. A recovered panic
 // is logged via slog with the component name, the panic value and a stack
 // trace; it is not re-raised.
 func runRecoverable(name string, fn func()) (panicked bool) {
 	defer func() {
 		cause := recover()
 		if cause == nil {
 			return
 		}
 		// The named result is the only way to report the panic to the caller.
 		panicked = true
 		slog.Error("recovered panic",
 			"component", name,
 			"panic", fmt.Sprint(cause),
 			"stack", string(debug.Stack()),
 		)
 	}()
 	fn()
 	return false
 }
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -4,10 +4,12 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"log/slog"
 	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"runtime/debug"
 	"sort"
 	"strings"
 	"sync"
@@ -83,16 +85,17 @@ func taskDisplayName(target, profile, loader string) string {
 // Task represents one unit of work in the queue.
 type Task struct {
-	ID        string     `json:"id"`
+	ID         string     `json:"id"`
-	Name      string     `json:"name"`
+	Name       string     `json:"name"`
-	Target    string     `json:"target"`
+	Target     string     `json:"target"`
-	Priority  int        `json:"priority"`
+	Priority   int        `json:"priority"`
-	Status    string     `json:"status"`
+	Status     string     `json:"status"`
-	CreatedAt time.Time  `json:"created_at"`
+	CreatedAt  time.Time  `json:"created_at"`
-	StartedAt *time.Time `json:"started_at,omitempty"`
+	StartedAt  *time.Time `json:"started_at,omitempty"`
-	DoneAt    *time.Time `json:"done_at,omitempty"`
+	DoneAt     *time.Time `json:"done_at,omitempty"`
-	ErrMsg    string     `json:"error,omitempty"`
+	ElapsedSec int        `json:"elapsed_sec,omitempty"`
-	LogPath   string     `json:"log_path,omitempty"`
+	ErrMsg     string     `json:"error,omitempty"`
 	LogPath    string     `json:"log_path,omitempty"`
 	// runtime fields (not serialised)
 	job    *jobState
@@ -101,14 +104,15 @@ type Task struct {
 // taskParams holds optional parameters parsed from the run request.
 type taskParams struct {
-	Duration          int    `json:"duration,omitempty"`
+	Duration           int      `json:"duration,omitempty"`
-	DiagLevel         int    `json:"diag_level,omitempty"`
+	DiagLevel          int      `json:"diag_level,omitempty"`
-	GPUIndices        []int  `json:"gpu_indices,omitempty"`
+	GPUIndices         []int    `json:"gpu_indices,omitempty"`
-	ExcludeGPUIndices []int  `json:"exclude_gpu_indices,omitempty"`
+	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
-	Loader            string `json:"loader,omitempty"`
+	Loader             string   `json:"loader,omitempty"`
-	BurnProfile       string `json:"burn_profile,omitempty"`
+	BurnProfile        string   `json:"burn_profile,omitempty"`
-	DisplayName       string `json:"display_name,omitempty"`
+	DisplayName        string   `json:"display_name,omitempty"`
-	Device            string `json:"device,omitempty"` // for install
+	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
 }
 type persistedTask struct {
@@ -171,13 +175,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions
 // taskQueue manages a priority-ordered list of tasks and runs them one at a time.
 type taskQueue struct {
-	mu        sync.Mutex
+	mu          sync.Mutex
-	tasks     []*Task
+	tasks       []*Task
-	trigger   chan struct{}
+	trigger     chan struct{}
-	opts      *HandlerOptions // set by startWorker
+	opts        *HandlerOptions // set by startWorker
-	statePath string
+	statePath   string
-	logsDir   string
+	logsDir     string
-	started   bool
+	started     bool
 	kmsgWatcher *kmsgWatcher
 }
 var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
@@ -289,6 +294,30 @@ func (q *taskQueue) findJob(id string) (*jobState, bool) {
 	return t.job, true
 }
 // taskStreamSource is a snapshot of the Task fields the log stream handler
 // needs, copied out by (*taskQueue).taskStreamSource while the queue lock is
 // held.
 type taskStreamSource struct {
 	status  string    // copy of Task.Status
 	errMsg  string    // copy of Task.ErrMsg
 	logPath string    // copy of Task.LogPath
 	job     *jobState // Task.job as stored on the task; may be nil
 }
 // taskStreamSource looks up the task with the given id under the queue lock
 // and returns a copy of its status, error message, log path and job pointer.
 // The boolean is false when no task has that id.
 func (q *taskQueue) taskStreamSource(id string) (taskStreamSource, bool) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
 	for idx := range q.tasks {
 		task := q.tasks[idx]
 		if task.ID == id {
 			var src taskStreamSource
 			src.status = task.Status
 			src.errMsg = task.ErrMsg
 			src.logPath = task.LogPath
 			src.job = task.job
 			return src, true
 		}
 	}
 	return taskStreamSource{}, false
 }
 func (q *taskQueue) hasActiveTarget(target string) bool {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -303,15 +332,19 @@ func (q *taskQueue) hasActiveTarget(target string) bool {
 	return false
 }
-// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
+// snapshot returns a copy of all tasks sorted for display with newest tasks first.
 func (q *taskQueue) snapshot() []Task {
 	q.mu.Lock()
 	defer q.mu.Unlock()
 	out := make([]Task, len(q.tasks))
 	for i, t := range q.tasks {
 		out[i] = *t
 		out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
 	}
 	sort.SliceStable(out, func(i, j int) bool {
 		if !out[i].CreatedAt.Equal(out[j].CreatedAt) {
 			return out[i].CreatedAt.After(out[j].CreatedAt)
 		}
 		si := statusOrder(out[i].Status)
 		sj := statusOrder(out[j].Status)
 		if si != sj {
@@ -320,7 +353,7 @@ func (q *taskQueue) snapshot() []Task {
 		if out[i].Priority != out[j].Priority {
 			return out[i].Priority > out[j].Priority
 		}
-		return out[i].CreatedAt.Before(out[j].CreatedAt)
+		return out[i].Name < out[j].Name
 	})
 	return out
 }
@@ -346,7 +379,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 	if !q.started {
 		q.loadLocked()
 		q.started = true
-		go q.worker()
+		goRecoverLoop("task worker", 2*time.Second, q.worker)
 	}
 	hasPending := q.nextPending() != nil
 	q.mu.Unlock()
@@ -361,47 +394,106 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 func (q *taskQueue) worker() {
 	for {
 		<-q.trigger
-		setCPUGovernor("performance")
+		func() {
-		for {
+			setCPUGovernor("performance")
-			q.mu.Lock()
+			defer setCPUGovernor("powersave")
 			t := q.nextPending()
 			if t == nil {
 				q.mu.Unlock()
 				break
 			}
 			now := time.Now()
 			t.Status = TaskRunning
 			t.StartedAt = &now
 			t.DoneAt = nil
 			t.ErrMsg = ""
 			j := newTaskJobState(t.LogPath)
 			ctx, cancel := context.WithCancel(context.Background())
 			j.cancel = cancel
 			t.job = j
 			q.persistLocked()
 			q.mu.Unlock()
 			q.runTask(t, j, ctx)
 			// Drain all pending tasks and start them in parallel.
 			q.mu.Lock()
-			now2 := time.Now()
+			var batch []*Task
-			t.DoneAt = &now2
+			for {
-			if t.Status == TaskRunning { // not cancelled externally
+				t := q.nextPending()
-				if j.err != "" {
+				if t == nil {
-					t.Status = TaskFailed
+					break
 					t.ErrMsg = j.err
 				} else {
 					t.Status = TaskDone
 				}
 				now := time.Now()
 				t.Status = TaskRunning
 				t.StartedAt = &now
 				t.DoneAt = nil
 				t.ErrMsg = ""
 				j := newTaskJobState(t.LogPath)
 				t.job = j
 				batch = append(batch, t)
 			}
 			if len(batch) > 0 {
 				q.persistLocked()
 			}
 			q.prune()
 			q.persistLocked()
 			q.mu.Unlock()
-		}
+
-		setCPUGovernor("powersave")
+			var wg sync.WaitGroup
 			for _, t := range batch {
 				t := t
 				j := t.job
 				taskCtx, taskCancel := context.WithCancel(context.Background())
 				j.cancel = taskCancel
 				wg.Add(1)
 				goRecoverOnce("task "+t.Target, func() {
 					defer wg.Done()
 					defer taskCancel()
 					q.executeTask(t, j, taskCtx)
 				})
 			}
 			wg.Wait()
 			if len(batch) > 0 {
 				q.mu.Lock()
 				q.prune()
 				q.persistLocked()
 				q.mu.Unlock()
 			}
 		}()
 	}
 }
 // executeTask runs a single task through runTask and wraps it with cleanup.
 //
 // The deferred calls run in reverse order of registration:
 //  1. the recover handler turns a panic from runTask into a logged error,
 //     appends an "ERROR: ..." line to the job and calls j.finish with it;
 //  2. the kmsg watcher is told the task finished, but only if it was told
 //     the task started;
 //  3. finalizeTaskRun records the final task state.
 //
 // Because the panic is recovered here, it does not propagate to the caller.
 func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
 	// Tracks whether NotifyTaskStarted was sent, so the finish notification
 	// is only emitted for tasks that were actually announced.
 	startedKmsgWatch := false
 	defer q.finalizeTaskRun(t, j)
 	defer func() {
 		if startedKmsgWatch && q.kmsgWatcher != nil {
 			q.kmsgWatcher.NotifyTaskFinished(t.ID)
 		}
 	}()
 	defer func() {
 		if rec := recover(); rec != nil {
 			msg := fmt.Sprintf("task panic: %v", rec)
 			slog.Error("task panic",
 				"task_id", t.ID,
 				"target", t.Target,
 				"panic", fmt.Sprint(rec),
 				"stack", string(debug.Stack()),
 			)
 			j.append("ERROR: " + msg)
 			j.finish(msg)
 		}
 	}()
 	// Only targets accepted by isSATTarget are announced to the kmsg watcher.
 	if q.kmsgWatcher != nil && isSATTarget(t.Target) {
 		q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
 		startedKmsgWatch = true
 	}
 	q.runTask(t, j, ctx)
 }
 // finalizeTaskRun stamps the task's completion time and, if the task is still
 // marked running, sets it to failed (with the job's error) or done. A task
 // whose status was already changed keeps that status. The queue is persisted
 // before the lock is released.
 func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
 	finishedAt := time.Now()
 	t.DoneAt = &finishedAt
 	if t.Status == TaskRunning {
 		// Default to success, then override when the job recorded an error.
 		t.Status, t.ErrMsg = TaskDone, ""
 		if j.err != "" {
 			t.Status, t.ErrMsg = TaskFailed, j.err
 		}
 	}
 	q.persistLocked()
 }
 // setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
 // Silently ignores errors (e.g. when cpufreq is not available).
 func setCPUGovernor(governor string) {
@@ -550,6 +642,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			break
 		}
 		opts := resolvePlatformStressPreset(t.params.BurnProfile)
 		opts.Components = t.params.PlatformComponents
 		archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
 	case "audit":
 		if a == nil {
@@ -587,6 +680,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		return
 	}
 	// If the SAT archive was produced, check overall_status and write to component DB.
 	if archive != "" {
 		archivePath := app.ExtractArchivePath(archive)
 		if err == nil {
 			if app.ReadSATOverallStatus(archivePath) == "FAILED" {
 				err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
 			}
 		}
 		if db := q.statusDB(); db != nil {
 			app.ApplySATResultToDB(db, t.Target, archivePath)
 		}
 	}
 	if err != nil {
 		if ctx.Err() != nil {
 			j.append("Aborted.")
@@ -603,6 +709,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	j.finish("")
 }
 // statusDB returns the component status database from the queue's handler
 // options, or nil when the options or their App are not set.
 func (q *taskQueue) statusDB() *app.ComponentStatusDB {
 	if opts := q.opts; opts != nil && opts.App != nil {
 		return opts.App.StatusDB
 	}
 	return nil
 }
 func splitLines(s string) []string {
 	var out []string
 	for _, l := range splitNL(s) {
@@ -712,23 +825,83 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 	writeJSON(w, map[string]int{"cancelled": n})
 }
 // handleAPITasksKillWorkers cancels every pending or running task in the
 // global queue (aborting the job of running tasks that have one), persists
 // the queue, then calls platform.KillTestWorkers and reports the counts and
 // the killed processes as JSON.
 func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Request) {
 	cancelled := 0
 	// Step 1: mark queued and running tasks as cancelled under the queue lock.
 	globalQueue.mu.Lock()
 	now := time.Now()
 	for _, task := range globalQueue.tasks {
 		running := task.Status == TaskRunning
 		if !running && task.Status != TaskPending {
 			continue
 		}
 		if running && task.job != nil {
 			task.job.abort()
 		}
 		task.Status = TaskCancelled
 		task.DoneAt = &now
 		cancelled++
 	}
 	globalQueue.persistLocked()
 	globalQueue.mu.Unlock()
 	// Step 2: kill leftover test worker processes at the OS level.
 	killed := platform.KillTestWorkers()
 	response := map[string]any{
 		"cancelled": cancelled,
 		"killed":    len(killed),
 		"processes": killed,
 	}
 	writeJSON(w, response)
 }
 func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
 	id := r.PathValue("id")
-	// Wait up to 5s for the task to get a job (it may be pending)
+	src, ok := globalQueue.taskStreamSource(id)
-	deadline := time.Now().Add(5 * time.Second)
+	if !ok {
-	var j *jobState
+		http.Error(w, "task not found", http.StatusNotFound)
 	for time.Now().Before(deadline) {
 		if jj, ok := globalQueue.findJob(id); ok {
 			j = jj
 			break
 		}
 		time.Sleep(200 * time.Millisecond)
 	}
 	if j == nil {
 		http.Error(w, "task not found or not yet started", http.StatusNotFound)
 		return
 	}
-	streamJob(w, r, j)
+	if src.job != nil {
 		streamJob(w, r, src.job)
 		return
 	}
 	if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
 		j := newTaskJobState(src.logPath)
 		j.finish(src.errMsg)
 		streamJob(w, r, j)
 		return
 	}
 	if !sseStart(w) {
 		return
 	}
 	sseWrite(w, "", "Task is queued. Waiting for worker...")
 	ticker := time.NewTicker(200 * time.Millisecond)
 	defer ticker.Stop()
 	for {
 		select {
 		case <-ticker.C:
 			src, ok = globalQueue.taskStreamSource(id)
 			if !ok {
 				sseWrite(w, "done", "task not found")
 				return
 			}
 			if src.job != nil {
 				streamSubscribedJob(w, r, src.job)
 				return
 			}
 			if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
 				j := newTaskJobState(src.logPath)
 				j.finish(src.errMsg)
 				streamSubscribedJob(w, r, j)
 				return
 			}
 		case <-r.Context().Done():
 			return
 		}
 	}
 }
 func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
@@ -765,8 +938,18 @@ func (q *taskQueue) loadLocked() {
 			params:    pt.Params,
 		}
 		q.assignTaskLogPathLocked(t)
-		if t.Status == TaskPending || t.Status == TaskRunning {
+		if t.Status == TaskRunning {
-			t.Status = TaskPending
+			// The task was interrupted by a bee-web restart. Child processes
 			// (e.g. bee-gpu-burn-worker) survive the restart in their own
 			// process groups and cannot be cancelled retroactively. Mark the
 			// task as failed so the user can decide whether to re-run it
 			// rather than blindly re-launching duplicate workers.
 			now := time.Now()
 			t.Status = TaskFailed
 			t.DoneAt = &now
 			t.ErrMsg = "interrupted by bee-web restart"
 		} else if t.Status == TaskPending {
 			t.StartedAt = nil
 			t.DoneAt = nil
 			t.ErrMsg = ""
 		}
@@ -806,3 +989,21 @@ func (q *taskQueue) persistLocked() {
 	}
 	_ = os.Rename(tmp, q.statePath)
 }
 // taskElapsedSec returns the task's run time in whole seconds, rounded to the
 // nearest second. The interval starts at StartedAt (clamped up to CreatedAt
 // when StartedAt is earlier) and ends at DoneAt, or at now when DoneAt is
 // unset. It returns 0 for a nil task, a task that has not started, or an
 // interval whose end precedes its start.
 func taskElapsedSec(t *Task, now time.Time) int {
 	if t == nil {
 		return 0
 	}
 	startedAt := t.StartedAt
 	if startedAt == nil || startedAt.IsZero() {
 		return 0
 	}
 	begin := *startedAt
 	if created := t.CreatedAt; !created.IsZero() && begin.Before(created) {
 		begin = created
 	}
 	finish := now
 	if done := t.DoneAt; done != nil && !done.IsZero() {
 		finish = *done
 	}
 	if finish.Before(begin) {
 		return 0
 	}
 	rounded := finish.Sub(begin).Round(time.Second)
 	return int(rounded / time.Second)
 }
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -2,6 +2,8 @@ package webui
 import (
 	"context"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -24,21 +26,34 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
 	}
 	started := time.Now().Add(-time.Minute)
-	task := &Task{
+
-		ID:        "task-1",
+	// A task that was pending (not yet started) must be re-queued on restart.
 	pendingTask := &Task{
 		ID:        "task-pending",
 		Name:      "Memory Burn-in",
 		Target:    "memory-stress",
 		Priority:  2,
-		Status:    TaskRunning,
+		Status:    TaskPending,
 		CreatedAt: time.Now().Add(-2 * time.Minute),
-		StartedAt: &started,
+		params:    taskParams{Duration: 300, BurnProfile: "smoke"},
-		params: taskParams{
+	}
-			Duration:    300,
+	// A task that was running when bee-web crashed must NOT be re-queued —
-			BurnProfile: "smoke",
+	// its child processes (e.g. gpu-burn-worker) survive the restart in
-		},
+	// their own process groups and can't be cancelled retroactively.
 	runningTask := &Task{
 		ID:        "task-running",
 		Name:      "NVIDIA GPU Stress",
 		Target:    "nvidia-stress",
 		Priority:  1,
 		Status:    TaskRunning,
 		CreatedAt: time.Now().Add(-3 * time.Minute),
 		StartedAt: &started,
 		params:    taskParams{Duration: 86400},
 	}
 	for _, task := range []*Task{pendingTask, runningTask} {
 		q.tasks = append(q.tasks, task)
 		q.assignTaskLogPathLocked(task)
 	}
 	q.tasks = append(q.tasks, task)
 	q.assignTaskLogPathLocked(task)
 	q.persistLocked()
 	recovered := &taskQueue{
@@ -48,18 +63,47 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
 	}
 	recovered.loadLocked()
-	if len(recovered.tasks) != 1 {
+	if len(recovered.tasks) != 2 {
-		t.Fatalf("tasks=%d want 1", len(recovered.tasks))
+		t.Fatalf("tasks=%d want 2", len(recovered.tasks))
 	}
-	got := recovered.tasks[0]
+
-	if got.Status != TaskPending {
+	byID := map[string]*Task{}
-		t.Fatalf("status=%q want %q", got.Status, TaskPending)
+	for i := range recovered.tasks {
 		byID[recovered.tasks[i].ID] = recovered.tasks[i]
 	}
-	if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
+
-		t.Fatalf("params=%+v", got.params)
+	// Pending task must be re-queued as pending with params intact.
 	p := byID["task-pending"]
 	if p == nil {
 		t.Fatal("task-pending not found")
 	}
-	if got.LogPath == "" {
+	if p.Status != TaskPending {
-		t.Fatal("expected log path")
+		t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
 	}
 	if p.StartedAt != nil {
 		t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
 	}
 	if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
 		t.Fatalf("pending task: params=%+v", p.params)
 	}
 	if p.LogPath == "" {
 		t.Fatal("pending task: expected log path")
 	}
 	// Running task must be marked failed, not re-queued, to prevent
 	// launching duplicate workers (e.g. a second set of gpu-burn-workers).
 	r := byID["task-running"]
 	if r == nil {
 		t.Fatal("task-running not found")
 	}
 	if r.Status != TaskFailed {
 		t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
 	}
 	if r.ErrMsg == "" {
 		t.Fatal("running task: expected non-empty error message")
 	}
 	if r.DoneAt == nil {
 		t.Fatal("running task: expected done_at to be set")
 	}
 }
@@ -80,6 +124,130 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
 	}
 }
 func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
 	now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
 	q := &taskQueue{
 		tasks: []*Task{
 			{
 				ID:        "old-running",
 				Name:      "Old Running",
 				Status:    TaskRunning,
 				Priority:  10,
 				CreatedAt: now.Add(-3 * time.Minute),
 			},
 			{
 				ID:        "new-done",
 				Name:      "New Done",
 				Status:    TaskDone,
 				Priority:  0,
 				CreatedAt: now.Add(-1 * time.Minute),
 			},
 			{
 				ID:        "mid-pending",
 				Name:      "Mid Pending",
 				Status:    TaskPending,
 				Priority:  1,
 				CreatedAt: now.Add(-2 * time.Minute),
 			},
 		},
 	}
 	got := q.snapshot()
 	if len(got) != 3 {
 		t.Fatalf("snapshot len=%d want 3", len(got))
 	}
 	if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
 		t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
 	}
 }
 func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
 	dir := t.TempDir()
 	logPath := filepath.Join(dir, "task.log")
 	if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
 		t.Fatal(err)
 	}
 	globalQueue.mu.Lock()
 	origTasks := globalQueue.tasks
 	globalQueue.tasks = []*Task{{
 		ID:        "done-1",
 		Name:      "Done Task",
 		Status:    TaskDone,
 		CreatedAt: time.Now(),
 		LogPath:   logPath,
 	}}
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = origTasks
 		globalQueue.mu.Unlock()
 	})
 	req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
 	req.SetPathValue("id", "done-1")
 	rec := httptest.NewRecorder()
 	h := &handler{}
 	h.handleAPITasksStream(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	body := rec.Body.String()
 	if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
 		t.Fatalf("body=%q", body)
 	}
 	if !strings.Contains(body, "event: done\n") {
 		t.Fatalf("missing done event: %q", body)
 	}
 }
 // TestHandleAPITasksStreamPendingTaskStartsSSEImmediately checks that
 // streaming a task that is still pending produces output right away: the
 // response body must contain the "Task is queued. Waiting for worker..."
 // message within two seconds, and the response status must be 200.
 func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
 	// Replace the global queue with a single pending task; restored in Cleanup.
 	globalQueue.mu.Lock()
 	origTasks := globalQueue.tasks
 	globalQueue.tasks = []*Task{{
 		ID:        "pending-1",
 		Name:      "Pending Task",
 		Status:    TaskPending,
 		CreatedAt: time.Now(),
 	}}
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = origTasks
 		globalQueue.mu.Unlock()
 	})
 	// The request context is the only way this test stops the handler, so
 	// cancel() must be called on every exit path before waiting on done.
 	ctx, cancel := context.WithCancel(context.Background())
 	req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
 	req.SetPathValue("id", "pending-1")
 	rec := httptest.NewRecorder()
 	// Run the handler in the background; done is closed when it returns.
 	done := make(chan struct{})
 	go func() {
 		h := &handler{}
 		h.handleAPITasksStream(rec, req)
 		close(done)
 	}()
 	// Poll the recorder until the queued message shows up or the deadline hits.
 	// NOTE(review): rec.Body is read here while the handler goroutine may still
 	// be writing to it, with no synchronization visible in this test — this
 	// looks like a data race under `go test -race`; confirm and consider a
 	// mutex-guarded ResponseWriter.
 	deadline := time.Now().Add(2 * time.Second)
 	for time.Now().Before(deadline) {
 		if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
 			// Stop the stream and wait for the handler to return before
 			// inspecting the status code.
 			cancel()
 			<-done
 			if rec.Code != http.StatusOK {
 				t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 			}
 			return
 		}
 		time.Sleep(20 * time.Millisecond)
 	}
 	// Timed out: shut the handler down first so the body read below happens
 	// after the goroutine has exited.
 	cancel()
 	<-done
 	t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
 }
 func TestResolveBurnPreset(t *testing.T) {
 	tests := []struct {
 		profile string
@@ -236,6 +404,26 @@ func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
 	}
 }
 // TestTaskElapsedSecClampsInvalidStartedAt covers two implausible StartedAt
 // values on a running task: a zero time must yield 0 elapsed seconds, and a
 // start time earlier than CreatedAt must yield the time since CreatedAt.
 func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
 	created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
 	now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)

 	var zeroStart time.Time
 	tk := &Task{
 		Status:    TaskRunning,
 		CreatedAt: created,
 		StartedAt: &zeroStart,
 	}
 	if elapsed := taskElapsedSec(tk, now); elapsed != 0 {
 		t.Fatalf("taskElapsedSec(zero start)=%d want 0", elapsed)
 	}

 	// A start time a full day before creation cannot be right.
 	staleStart := created.Add(-24 * time.Hour)
 	tk.StartedAt = &staleStart
 	want := int(now.Sub(created).Seconds())
 	if elapsed := taskElapsedSec(tk, now); elapsed != want {
 		t.Fatalf("taskElapsedSec(stale start)=%d want %d", elapsed, want)
 	}
 }
 func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
 	q := &taskQueue{
 		opts: &HandlerOptions{},
@@ -279,3 +467,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
 		t.Fatalf("unexpected error: %q", j.err)
 	}
 }
 // TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow replaces the CPU
 // acceptance runner with one that panics and then asserts that executeTask
 // returns normally with the task marked failed, the panic text recorded on
 // both the task and the job, and the kmsg watcher left with no active window.
 func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
 	const wantErr = "task panic: boom"

 	tmp := t.TempDir()
 	q := &taskQueue{
 		opts:        &HandlerOptions{App: &app.App{}},
 		statePath:   filepath.Join(tmp, "tasks-state.json"),
 		logsDir:     filepath.Join(tmp, "tasks"),
 		kmsgWatcher: newKmsgWatcher(nil),
 	}
 	job := &jobState{}
 	task := &Task{
 		ID:        "cpu-panic-1",
 		Name:      "CPU SAT",
 		Target:    "cpu",
 		Status:    TaskRunning,
 		CreatedAt: time.Now(),
 	}

 	// Swap in a runner that always panics; put the real one back on exit.
 	prevRunner := runCPUAcceptancePackCtx
 	defer func() { runCPUAcceptancePackCtx = prevRunner }()
 	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
 		panic("boom")
 	}

 	q.executeTask(task, job, context.Background())

 	if task.Status != TaskFailed {
 		t.Fatalf("status=%q want %q", task.Status, TaskFailed)
 	}
 	if task.DoneAt == nil {
 		t.Fatal("expected done_at to be set")
 	}
 	if !strings.Contains(task.ErrMsg, wantErr) {
 		t.Fatalf("task error=%q", task.ErrMsg)
 	}
 	if !strings.Contains(job.err, wantErr) {
 		t.Fatalf("job error=%q", job.err)
 	}

 	// Copy the watcher state out under its lock before asserting on it.
 	watcher := q.kmsgWatcher
 	watcher.mu.Lock()
 	active, win := watcher.activeCount, watcher.window
 	watcher.mu.Unlock()

 	if active != 0 {
 		t.Fatalf("activeCount=%d want 0", active)
 	}
 	if win != nil {
 		t.Fatalf("expected kmsg window to be cleared, got %+v", win)
 	}
 }
--- a/audit/scripts/resolve-version.sh
+++ b/audit/scripts/resolve-version.sh
@@ -0,0 +1,16 @@
 #!/bin/sh
 # Print the build version derived from the nearest `v<digit>...` git tag.
 # The leading "v" is dropped from the description; when git yields nothing
 # (no matching tag, or git fails) the literal "dev" is printed instead.
 set -eu

 described="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"

 if [ -z "${described}" ]; then
 	printf 'dev\n'
 else
 	# ${described#v} strips one leading "v" and leaves any other value as is.
 	printf '%s\n' "${described#v}"
 fi
--- a/2
+++ b/2
--- a/bible-local/architecture/charting.md
+++ b/bible-local/architecture/charting.md
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
 and polled by the browser every 2 seconds via `<img src="...?t=now">`.
 There is no client-side canvas or JS chart library.
 ## Rule: live charts must be visually uniform
 Live charts are a single UI family, not a set of one-off widgets. New charts and
 changes to existing charts must keep the same rendering model and presentation
 rules unless there is an explicit architectural decision to diverge.
 Default expectations:
 - same server-side SVG pipeline for all live metrics charts
 - same refresh behaviour and failure handling in the browser
 - same canvas size class and card layout
 - same legend placement policy across charts
 - same axis, title, and summary conventions
 - no chart-specific visual exceptions added as a quick fix
 Current default for live charts:
 - legend below the plot area when a chart has 8 series or fewer
 - legend hidden when a chart has more than 8 series
 - 10 equal Y-axis steps across the chart height
 - 1400 x 360 SVG canvas with legend
 - 1400 x 288 SVG canvas without legend
 - full-width card rendering in a single-column stack
 If one chart needs a different layout or legend behaviour, treat that as a
 design-level decision affecting the whole chart family, not as a local tweak to
 just one endpoint.
 ### Why go-analyze/charts
 - Pure Go, no CGO — builds cleanly inside the live-build container
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
 | `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
 | `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
-Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
+Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
 the legend is hidden. The page renders them at `width: 100%` in a
 single-column layout so they always fill the viewport width.
 ### Ring buffers
--- a/bible-local/architecture/runtime-flows.md
+++ b/bible-local/architecture/runtime-flows.md
@@ -60,6 +60,8 @@ Rules:
 - Chromium opens `http://localhost/` — the full interactive web UI
 - SSH is independent from the desktop path
 - serial console support is enabled for VM boot debugging
 - Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
 - Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`
 ## ISO build sequence
--- a/bible-local/decisions/2026-04-01-memtest-build-strategy.md
+++ b/bible-local/decisions/2026-04-01-memtest-build-strategy.md
@@ -1,7 +1,7 @@
 # Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
 **Date:** 2026-04-01
-**Status:** active
+**Status:** resolved
 ## Context
@@ -23,6 +23,100 @@ Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026
 So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
 Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
 - the build now completes successfully because memtest is non-blocking by default
 - `lb binary_memtest` still runs and installs `memtest86+`
 - the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
 - but it executes too early for its current target paths:
  - `binary/boot/grub/grub.cfg` is still missing at hook time
  - `binary/isolinux/live.cfg` is still missing at hook time
  - memtest binaries are also still absent in `binary/boot/`
 - later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
 - but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
 So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
 Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
 artifact dated 2026-04-01:
 - the final ISO does contain `boot/memtest86+x64.bin`
 - the final ISO does contain `boot/memtest86+x64.efi`
 - the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
  and `isolinux/live.cfg`
 - so `v3.20-5-g76a9100` was **not** another real memtest regression in the
  shipped ISO
 - the regression was in the build-time validator/debug path in `build.sh`
 Root cause of the false alarm:
 - `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
  successfully listed/extracted members"
 - `iso_list_files` / `iso_extract_file` failures were collapsed into the same
  observable output as "memtest content missing"
 - this made a reader failure look identical to a missing memtest payload
 - as a result, we re-entered the same memtest investigation loop even though
  the real ISO was already correct
 Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
 - once ISO reading was fixed, the post-build debug correctly showed the raw ISO
  still carried live-build's default memtest layout (`live/memtest.bin`,
  `live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
 - that mismatch is expected to trigger project recovery, because `bee` requires
  `boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
 - however, `build.sh` exited before recovery because `set -e` treated a direct
  `iso_memtest_present` return code of `1` as fatal
 - so the next repeated loop was caused by shell control flow, not by proof that
  the recovery design itself was wrong
 ## Known Failed Attempts
 These approaches were already tried and should not be repeated blindly:
 1. Built-in live-build memtest only.
 Reason it failed:
 - `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
 2. Fixing only the memtest file names for Debian Bookworm.
 Reason it failed:
 - correct file names alone do not make the files appear in the final ISO.
 3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
 Reason it failed:
 - in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
 4. Fallback extraction from cached `memtest86+` `.deb`.
 Reason it failed:
 - this was explored already and was not enough to stabilize the final ISO path end-to-end.
 5. Restoring explicit memtest menu entries in source bootloader templates only.
 Reason it failed:
 - memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
 6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
 Reason it failed:
 - the hook runs before those files exist, so the hook cannot patch them there.
 ## What This Means
 When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
 - do not assume the built-in memtest stage is sufficient
 - do not assume `chroot/boot/` will contain memtest payloads
 - do not assume source bootloader templates are the last writer of final ISO configs
 - do not assume the current normal binary hook timing is late enough for final patching
 Any future memtest fix must explicitly identify:
 - where the memtest binaries are reliably available at build time
 - which exact build stage writes the final bootloader configs that land in the ISO
 - a post-build proof from a real ISO, not only from intermediate workdir files
 - whether the ISO inspection step itself succeeded, rather than merely whether
  the validator printed a memtest warning
 - whether a non-zero probe is intentionally handled inside an `if` / `case`
  context rather than accidentally tripping `set -e`
 ## Decision
 For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
@@ -44,12 +138,87 @@ Project rules from now on:
 Current implementation direction:
 - keep the live-build memtest stage enabled if it helps package acquisition
- but enforce memtest explicitly in a project-owned binary hook
+- do not rely on the current early `binary_hooks` timing for final patching
- patch the generated `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly in the binary stage if memtest entries are missing
+- prefer a post-`lb build` recovery step in `build.sh` that:
  - patches the fully materialized `LB_DIR/binary` tree
  - injects memtest binaries there
  - ensures final bootloader entries there
  - reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
 - also treat ISO validation tooling as part of the critical path:
  - install a stable ISO reader in the builder image
  - fail with an explicit reader error if ISO listing/extraction fails
  - do not treat reader failure as evidence that memtest is missing
  - do not call a probe that may return "needs recovery" as a bare command under
    `set -e`; wrap it in explicit control flow
 ## Consequences
 - Future memtest changes must begin by reading this ADR and the commits listed above.
 - Future memtest changes must also begin by reading the failed-attempt list above.
 - We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
 - Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
 - But validation output is only trustworthy if ISO reading itself succeeded. A
  "missing memtest" warning without a successful ISO read is not evidence.
 - If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
 ## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
 This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
 and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
 ### Components
 **1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
 Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
 those files may not exist yet. Instead:
 - Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
 - Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
 - Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
  If they do not exist, the hook warns and continues (does not fail).
 Controlled by `BEE_REQUIRE_MEMTEST=1` env var to turn warnings into hard errors when needed.
 **2. Post-`lb build` recovery step in `build.sh`**
 After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
 contains all required memtest artifacts. If not:
 - Copies/extracts memtest binaries into `binary/boot/`.
 - Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
 - Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
  the ISO with the patched tree.
 This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
 step handles the final `binary/` tree after live-build has written all bootloader configs.
 **3. ISO validation hardening**
 The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
 as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
 handled — it does not abort the build prematurely.
 ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
 If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
 This prevents the false-negative loop that consumed builds v3.14–v3.19 on 2026-04-01.
 ### Why this works when earlier attempts did not
 The earlier patterns all shared a single flaw: they assumed a single build-time point
 (hook or source template) would be the last writer of bootloader configs and memtest payloads.
 In live-build on Debian Bookworm that assumption is false — live-build continues writing
 bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
 The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
 `binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
 There is no ordering dependency to get wrong.
 ### Do not revert
 Do not remove the recovery step or the hook without a fresh real ISO build proving
 live-build alone produces all four required artifacts:
 - `boot/memtest86+x64.bin`
 - `boot/memtest86+x64.efi`
 - memtest entry in `boot/grub/grub.cfg`
 - memtest entry in `isolinux/live.cfg`
--- a/bible-local/docs/iso-build-rules.md
+++ b/bible-local/docs/iso-build-rules.md
@@ -32,6 +32,9 @@ contains all of the following:
 Rules:
 - Keep explicit post-build memtest validation in `build.sh`.
 - Treat ISO reader success as a separate prerequisite from memtest content.
  If the reader cannot list or extract from the ISO, that is a validator
  failure, not proof that memtest is missing.
 - If built-in integration does not produce the artifacts above, use a
  deterministic project-owned copy/extract step instead of hoping live-build
  will "start working".
@@ -39,3 +42,21 @@ Rules:
  a real ISO.
 - If you reference memtest files manually, verify the exact package file list
  first for the target Debian release.
 Known bad loops for this repository:
 - Do not retry built-in-only memtest without new evidence. We already proved
  that `lb binary_memtest` can run while the final ISO still has no memtest.
 - Do not assume fixing memtest file names is enough. Correct names did not fix
  the final artifact path.
 - Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
 - Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
  bootloader configs.
 - Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
  timing is late enough to patch final `binary/boot/grub/grub.cfg` or
  `binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
  present yet when the hook executed.
 - Do not treat a validator warning as ground truth until you have confirmed the
  ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
  regression because the final ISO was correct but the validator produced a
  false negative.
--- a/iso/builder/Dockerfile
+++ b/iso/builder/Dockerfile
@@ -17,6 +17,7 @@ RUN apt-get update -qq && apt-get install -y \
    wget \
    curl \
    tar \
    libarchive-tools \
    xz-utils \
    rsync \
    build-essential \
--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -8,7 +8,7 @@ NCCL_TESTS_VERSION=2.13.10
 NVCC_VERSION=12.8
 CUBLAS_VERSION=13.0.2.14-1
 CUDA_USERSPACE_VERSION=13.0.96-1
-DCGM_VERSION=4.5.2-1
+DCGM_VERSION=4.5.3-1
 JOHN_JUMBO_COMMIT=67fcf9fe5a
 ROCM_VERSION=6.3.4
 ROCM_SMI_VERSION=7.4.0.60304-76~22.04
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -32,7 +32,7 @@ lb config noauto \
    --memtest memtest86+ \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
-    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
+    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
--- a/iso/builder/build-nvidia-module.sh
+++ b/iso/builder/build-nvidia-module.sh
@@ -46,7 +46,10 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
 EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
 CACHE_LAYOUT_VERSION="2"
 CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
 if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
        && [ -f "$CACHE_LAYOUT_MARKER" ] \
        && [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
    echo "=== NVIDIA cached, skipping build ==="
    echo "cache: $CACHE_DIR"
@@ -130,24 +133,30 @@ else
    echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
 fi
-# Copy ALL userspace library files.
+# Copy NVIDIA userspace libraries broadly instead of whitelisting a few names.
-# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
+# Newer driver branches add extra runtime deps (for example OpenCL/compiler side
-# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
+# libraries). If we only copy a narrow allowlist, clinfo/John can see nvidia.icd
 # but still fail with "no OpenCL platforms" because one dependent .so is absent.
 copied_libs=0
 for f in $(find "$EXTRACT_DIR" -maxdepth 1 \( -name 'libnvidia*.so.*' -o -name 'libcuda.so.*' \) -type f 2>/dev/null | sort); do
    cp "$f" "$CACHE_DIR/lib/"
    copied_libs=$((copied_libs+1))
 done
 if [ "$copied_libs" -eq 0 ]; then
    echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
    ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
    exit 1
 fi
 for lib in \
    libnvidia-ml \
    libcuda \
    libnvidia-ptxjitcompiler \
-    libnvidia-opencl \
+    libnvidia-opencl; do
-    libnvidia-compiler \
+    if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then
-    libnvidia-nvvm \
+        echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
-    libnvidia-fatbinaryloader; do
+        ls "$CACHE_DIR/lib/" | sort >&2 || true
    count=0
    for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
        cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
    done
    if [ "$count" -eq 0 ]; then
        echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
        ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
        exit 1
    fi
 done
@@ -156,23 +165,17 @@ done
 ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
 [ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
-# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
+# Create soname symlinks for every copied versioned library.
-for lib in \
+for versioned in "$CACHE_DIR"/lib/*.so.*; do
-    libnvidia-ml \
+    [ -f "$versioned" ] || continue
    libcuda \
    libnvidia-ptxjitcompiler \
    libnvidia-opencl \
    libnvidia-compiler \
    libnvidia-nvvm \
    libnvidia-fatbinaryloader; do
    versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
    [ -n "$versioned" ] || continue
    base=$(basename "$versioned")
-    ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
+    stem=${base%%.so.*}
-    ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
+    ln -sf "$base" "$CACHE_DIR/lib/${stem}.so.1"
-    echo "${lib}: .so.1 -> $base"
+    ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
 done
 touch "$CACHE_LAYOUT_MARKER"
 echo "=== NVIDIA build complete ==="
 echo "cache: $CACHE_DIR"
 echo "modules: $ko_count .ko files"
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -54,15 +54,8 @@ resolve_audit_version() {
        return 0
    fi
-    tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
+    tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
    if [ -z "${tag}" ]; then
        tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
    fi
    case "${tag}" in
        audit/v*)
            echo "${tag#audit/v}"
            return 0
            ;;
        v*)
            echo "${tag#v}"
            return 0
@@ -145,6 +138,25 @@ iso_extract_file() {
    return 127
 }
 # iso_read_file_list ISO_PATH OUT_PATH
 # Writes the output of iso_list_files for ISO_PATH into OUT_PATH.
 # Returns 0 only when the listing command succeeded and OUT_PATH is
 # non-empty; returns 1 otherwise.
 iso_read_file_list() {
     iso_path="$1"
     out_path="$2"
     if iso_list_files "$iso_path" > "$out_path" && [ -s "$out_path" ]; then
         return 0
     fi
     return 1
 }
 # iso_read_member ISO_PATH MEMBER OUT_PATH
 # Writes the output of iso_extract_file for MEMBER of ISO_PATH into OUT_PATH.
 # Returns 0 only when the extraction command succeeded and OUT_PATH is
 # non-empty; returns 1 otherwise.
 iso_read_member() {
     iso_path="$1"
     iso_member="$2"
     out_path="$3"
     if iso_extract_file "$iso_path" "$iso_member" > "$out_path" && [ -s "$out_path" ]; then
         return 0
     fi
     return 1
 }
 require_iso_reader() {
    command -v bsdtar >/dev/null 2>&1 && return 0
    command -v xorriso >/dev/null 2>&1 && return 0
@@ -237,14 +249,32 @@ dump_memtest_debug() {
        fi
        if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
            iso_files="$(mktemp)"
            iso_grub_cfg="$(mktemp)"
            iso_isolinux_cfg="$(mktemp)"
            echo "-- ISO memtest files --"
-            iso_list_files "$iso_path" | grep 'memtest' | sed 's/^/  /' || echo "  (no memtest files in ISO)"
+            if iso_read_file_list "$iso_path" "$iso_files"; then
                grep 'memtest' "$iso_files" | sed 's/^/  /' || echo "  (no memtest files in ISO)"
            else
                echo "  (failed to list ISO contents)"
            fi
            echo "-- ISO GRUB memtest lines --"
-            iso_extract_file "$iso_path" boot/grub/grub.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo "  (no memtest lines in boot/grub/grub.cfg)"
+            if iso_read_member "$iso_path" boot/grub/grub.cfg "$iso_grub_cfg"; then
                grep -n 'Memory Test\|memtest' "$iso_grub_cfg" || echo "  (no memtest lines in boot/grub/grub.cfg)"
            else
                echo "  (failed to read boot/grub/grub.cfg from ISO)"
            fi
            echo "-- ISO isolinux memtest lines --"
-            iso_extract_file "$iso_path" isolinux/live.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo "  (no memtest lines in isolinux/live.cfg)"
+            if iso_read_member "$iso_path" isolinux/live.cfg "$iso_isolinux_cfg"; then
                grep -n 'Memory Test\|memtest' "$iso_isolinux_cfg" || echo "  (no memtest lines in isolinux/live.cfg)"
            else
                echo "  (failed to read isolinux/live.cfg from ISO)"
            fi
            rm -f "$iso_files" "$iso_grub_cfg" "$iso_isolinux_cfg"
        fi
        echo "=== end memtest debug: ${phase} ==="
@@ -272,6 +302,71 @@ memtest_fail() {
    return 0
 }
 # iso_memtest_present ISO_PATH
 # Probes ISO_PATH for the memtest artifacts this project requires.
 #
 # Return codes:
 #   0  both memtest binaries are listed in the ISO and both bootloader
 #      configs contain the expected memtest entries
 #   1  ISO_PATH is not a regular file, or a required file/entry is missing
 #   2  the ISO could not be inspected (no bsdtar/xorriso on PATH, or the
 #      listing/extraction helpers failed)
 #
 # Temp files are created only after the cheap precondition checks pass and
 # are removed on a single exit path, so no early return leaves one behind.
 iso_memtest_present() {
     iso_path="$1"

     [ -f "$iso_path" ] || return 1
     if ! command -v bsdtar >/dev/null 2>&1 && ! command -v xorriso >/dev/null 2>&1; then
         return 2
     fi

     iso_files="$(mktemp)"
     grub_cfg="$(mktemp)"
     isolinux_cfg="$(mktemp)"

     # First failing check decides the result; order matches the probe's
     # contract (reader failures are 2, missing content is 1).
     iso_memtest_rc=0
     if ! iso_read_file_list "$iso_path" "$iso_files"; then
         iso_memtest_rc=2
     elif ! grep -q '^boot/memtest86+x64\.bin$' "$iso_files"; then
         iso_memtest_rc=1
     elif ! grep -q '^boot/memtest86+x64\.efi$' "$iso_files"; then
         iso_memtest_rc=1
     elif ! iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg"; then
         iso_memtest_rc=2
     elif ! iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg"; then
         iso_memtest_rc=2
     elif ! grep -q 'Memory Test (memtest86+)' "$grub_cfg"; then
         iso_memtest_rc=1
     elif ! grep -q '/boot/memtest86+x64\.efi' "$grub_cfg"; then
         iso_memtest_rc=1
     elif ! grep -q '/boot/memtest86+x64\.bin' "$grub_cfg"; then
         iso_memtest_rc=1
     elif ! grep -q 'Memory Test (memtest86+)' "$isolinux_cfg"; then
         iso_memtest_rc=1
     elif ! grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg"; then
         iso_memtest_rc=1
     fi

     rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
     return "$iso_memtest_rc"
 }
 validate_iso_memtest() {
    iso_path="$1"
    echo "=== validating memtest in ISO ==="
@@ -282,65 +377,219 @@ validate_iso_memtest() {
    }
    require_iso_reader "$iso_path" || return 0
-    iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.bin$' || {
+    iso_files="$(mktemp)"
-        memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
+    iso_read_file_list "$iso_path" "$iso_files" || {
        memtest_fail "failed to list ISO contents while validating memtest" "$iso_path"
        rm -f "$iso_files"
        return 0
    }
-    iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.efi$' || {
+
    grep -q '^boot/memtest86+x64\.bin$' "$iso_files" || {
        memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
        rm -f "$iso_files"
        return 0
    }
    grep -q '^boot/memtest86+x64\.efi$' "$iso_files" || {
        memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
        rm -f "$iso_files"
        return 0
    }
    grub_cfg="$(mktemp)"
    isolinux_cfg="$(mktemp)"
-    iso_extract_file "$iso_path" boot/grub/grub.cfg > "$grub_cfg" || {
+    iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
-        memtest_fail "failed to extract boot/grub/grub.cfg from ISO" "$iso_path"
+        memtest_fail "failed to read boot/grub/grub.cfg from ISO" "$iso_path"
-        rm -f "$grub_cfg" "$isolinux_cfg"
+        rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
        return 0
    }
-    iso_extract_file "$iso_path" isolinux/live.cfg > "$isolinux_cfg" || {
+    iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
-        memtest_fail "failed to extract isolinux/live.cfg from ISO" "$iso_path"
+        memtest_fail "failed to read isolinux/live.cfg from ISO" "$iso_path"
-        rm -f "$grub_cfg" "$isolinux_cfg"
+        rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
        return 0
    }
    grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
        memtest_fail "GRUB menu entry for memtest is missing" "$iso_path"
-        rm -f "$grub_cfg" "$isolinux_cfg"
+        rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
        return 0
    }
    grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
        memtest_fail "GRUB memtest EFI path is missing" "$iso_path"
-        rm -f "$grub_cfg" "$isolinux_cfg"
+        rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
        return 0
    }
    grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
        memtest_fail "GRUB memtest BIOS path is missing" "$iso_path"
-        rm -f "$grub_cfg" "$isolinux_cfg"
+        rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
        return 0
    }
    grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
        memtest_fail "isolinux menu entry for memtest is missing" "$iso_path"
-        rm -f "$grub_cfg" "$isolinux_cfg"
+        rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
        return 0
    }
    grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
        memtest_fail "isolinux memtest path is missing" "$iso_path"
-        rm -f "$grub_cfg" "$isolinux_cfg"
+        rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
        return 0
    }
-    rm -f "$grub_cfg" "$isolinux_cfg"
+    rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
    echo "=== memtest validation OK ==="
 }
 # append_memtest_grub_entry GRUB_CFG
 #
 # Appends a memtest86+ menu block to GRUB_CFG unless one is already there.
 # Returns 1 when GRUB_CFG is not an existing regular file; returns 0 without
 # touching the file when either the menu title or the BEE MEMTEST marker is
 # already present.
 append_memtest_grub_entry() {
    grub_cfg="$1"
    [ -f "$grub_cfg" ] || return 1
    # Idempotency guards: an existing entry or an earlier append means nothing to do.
    grep -q 'Memory Test (memtest86+)' "$grub_cfg" && return 0
    grep -q '### BEE MEMTEST ###' "$grub_cfg" && return 0
    # Quoted heredoc delimiter: ${grub_platform} is written literally so GRUB,
    # not this shell, evaluates it.
    cat >> "$grub_cfg" <<'EOF'
 ### BEE MEMTEST ###
 if [ "${grub_platform}" = "efi" ]; then
    menuentry "Memory Test (memtest86+)" {
        chainloader /boot/memtest86+x64.efi
    }
 else
    menuentry "Memory Test (memtest86+)" {
        linux16 /boot/memtest86+x64.bin
    }
 fi
 ### /BEE MEMTEST ###
 EOF
 }
 # append_memtest_isolinux_entry ISOLINUX_CFG
 #
 # Appends a "memtest" label to ISOLINUX_CFG unless one is already there.
 # Returns 1 when ISOLINUX_CFG is not an existing regular file; returns 0
 # without touching the file when either the menu title or the BEE MEMTEST
 # marker is already present.
 append_memtest_isolinux_entry() {
    isolinux_cfg="$1"
    [ -f "$isolinux_cfg" ] || return 1
    # Idempotency guards: an existing entry or an earlier append means nothing to do.
    grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" && return 0
    grep -q '### BEE MEMTEST ###' "$isolinux_cfg" && return 0
    # The marker lines are written as "#" comments in the appended block.
    cat >> "$isolinux_cfg" <<'EOF'
 # ### BEE MEMTEST ###
 label memtest
    menu label ^Memory Test (memtest86+)
    linux /boot/memtest86+x64.bin
 # ### /BEE MEMTEST ###
 EOF
 }
 # copy_memtest_from_deb DEB DST_BOOT
 #
 # Unpacks the package DEB into a scratch directory and copies whichever of
 # boot/memtest86+x64.bin and boot/memtest86+x64.efi it contains into DST_BOOT.
 # A package that lacks one or both files is not an error.
 #
 # Returns 0 on success and 1 when the scratch directory cannot be created,
 # the package cannot be unpacked, or a copy fails. The scratch directory is
 # removed on every path, including failures.
 copy_memtest_from_deb() {
    deb="$1"
    dst_boot="$2"
    tmpdir="$(mktemp -d)" || return 1
    dpkg-deb -x "$deb" "$tmpdir" || {
        echo "memtest recovery: WARNING: failed to unpack $deb" >&2
        rm -rf "$tmpdir"
        return 1
    }
    for f in memtest86+x64.bin memtest86+x64.efi; do
        if [ -f "$tmpdir/boot/$f" ]; then
            cp "$tmpdir/boot/$f" "$dst_boot/$f" || {
                echo "memtest recovery: WARNING: failed to copy $f to $dst_boot" >&2
                rm -rf "$tmpdir"
                return 1
            }
        fi
    done
    rm -rf "$tmpdir"
 }
 # reset_live_build_stage LB_DIR STAGE
 #
 # Deletes every direct child of LB_DIR/.build, LB_DIR/.stage and LB_DIR/auto
 # whose name contains STAGE. Roots that do not exist are skipped and removal
 # errors are ignored (best effort).
 #
 # An empty STAGE is refused: the name pattern would degenerate to "**" and
 # match every entry.
 reset_live_build_stage() {
    lb_dir="$1"
    stage="$2"
    if [ -z "$stage" ]; then
        echo "WARNING: reset_live_build_stage called with an empty stage; nothing removed" >&2
        return 0
    fi
    for root in \
        "$lb_dir/.build" \
        "$lb_dir/.stage" \
        "$lb_dir/auto"; do
        [ -d "$root" ] || continue
        # -mindepth 1 keeps the root directory itself out of the match set.
        # "*STAGE*" already covers the exact name and the "STAGE.*" variants.
        find "$root" -mindepth 1 -maxdepth 1 -name "*${stage}*" -exec rm -rf {} + 2>/dev/null || true
    done
 }
 # recover_iso_memtest LB_DIR ISO_PATH
 #
 # Best-effort repair of a live-build binary tree that lacks memtest86+:
 #   1. copy the BIOS/EFI payloads into LB_DIR/binary/boot from the chroot or
 #      from the build host's /boot;
 #   2. failing that, unpack them from a cached memtest86+ .deb;
 #   3. failing that, download the package with apt-get;
 #   4. make sure grub.cfg and isolinux/live.cfg carry a memtest menu entry;
 #   5. clear the checksum/ISO/zsync stage entries and run those lb stages again.
 # Reports through memtest_fail when ISO_PATH does not exist afterwards.
 # Variables are plain shell globals (no "local"), so grub_cfg, isolinux_cfg,
 # deb, f, root and dir are overwritten for the caller as well.
 recover_iso_memtest() {
    lb_dir="$1"
    iso_path="$2"
    binary_boot="$lb_dir/binary/boot"
    grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
    isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
    echo "=== attempting memtest recovery in binary tree ==="
    mkdir -p "$binary_boot"
    # Step 1: reuse payloads that already exist on disk; never overwrite a file
    # that is already present in the binary tree.
    for root in \
        "$lb_dir/chroot/boot" \
        "/boot"; do
        for f in memtest86+x64.bin memtest86+x64.efi; do
            if [ ! -f "$binary_boot/$f" ] && [ -f "$root/$f" ]; then
                cp "$root/$f" "$binary_boot/$f"
                echo "memtest recovery: copied $f from $root"
            fi
        done
    done
    # Step 2: still incomplete - look through the package caches. Only the first
    # directory that holds a memtest86+ .deb is used (note the break).
    if [ ! -f "$binary_boot/memtest86+x64.bin" ] || [ ! -f "$binary_boot/memtest86+x64.efi" ]; then
        for dir in \
            "$lb_dir/cache/packages.binary" \
            "$lb_dir/cache/packages.chroot" \
            "$lb_dir/chroot/var/cache/apt/archives" \
            "${BEE_CACHE_DIR:-${DIST_DIR}/cache}/lb-packages" \
            "/var/cache/apt/archives"; do
            [ -d "$dir" ] || continue
            deb="$(find "$dir" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
            [ -n "$deb" ] || continue
            echo "memtest recovery: extracting payload from $deb"
            copy_memtest_from_deb "$deb" "$binary_boot"
            break
        done
    fi
    # Step 3: last resort - download the package into a scratch directory. The
    # subshell keeps the cd from leaking into the caller's working directory.
    if [ ! -f "$binary_boot/memtest86+x64.bin" ] || [ ! -f "$binary_boot/memtest86+x64.efi" ]; then
        tmpdl="$(mktemp -d)"
        if (
            cd "$tmpdl" && apt-get download memtest86+ >/dev/null 2>&1
        ); then
            deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
            if [ -n "$deb" ]; then
                echo "memtest recovery: downloaded $deb"
                copy_memtest_from_deb "$deb" "$binary_boot"
            fi
        fi
        rm -rf "$tmpdl"
    fi
    # Step 4: ensure both boot menus reference memtest; a missing config file is
    # only warned about.
    if [ -f "$grub_cfg" ]; then
        append_memtest_grub_entry "$grub_cfg" && echo "memtest recovery: ensured GRUB entry"
    else
        echo "memtest recovery: WARNING: missing $grub_cfg"
    fi
    if [ -f "$isolinux_cfg" ]; then
        append_memtest_isolinux_entry "$isolinux_cfg" && echo "memtest recovery: ensured isolinux entry"
    else
        echo "memtest recovery: WARNING: missing $isolinux_cfg"
    fi
    # Step 5: drop the stage entries for the three final stages, then run them
    # again (presumably live-build skips stages whose entries exist - confirm).
    reset_live_build_stage "$lb_dir" "binary_checksums"
    reset_live_build_stage "$lb_dir" "binary_iso"
    reset_live_build_stage "$lb_dir" "binary_zsync"
    run_optional_step_sh "rebuild live-build checksums after memtest recovery" "91-lb-checksums" "lb binary_checksums 2>&1"
    # NOTE(review): iso_path is spliced into a single-quoted shell string; a path
    # containing a single quote would break this command.
    run_optional_step_sh "rebuild ISO after memtest recovery" "92-lb-binary-iso" "rm -f '$iso_path' && lb binary_iso 2>&1"
    run_optional_step_sh "rebuild zsync after memtest recovery" "93-lb-zsync" "lb binary_zsync 2>&1"
    # The old ISO was deleted before the rebuild, so a missing file means the
    # rebuild did not produce a new one.
    if [ ! -f "$iso_path" ]; then
        memtest_fail "ISO rebuild was skipped or failed after memtest recovery: $iso_path" "$iso_path"
    fi
 }
 AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
 ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
 ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
-LOG_DIR="${DIST_DIR}/${ISO_BASENAME}.logs"
+# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
-LOG_ARCHIVE="${DIST_DIR}/${ISO_BASENAME}.logs.tar.gz"
+OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
-ISO_OUT="${DIST_DIR}/${ISO_BASENAME}.iso"
+mkdir -p "${OUT_DIR}"
 LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
 LOG_ARCHIVE="${OUT_DIR}/${ISO_BASENAME}.logs.tar.gz"
 ISO_OUT="${OUT_DIR}/${ISO_BASENAME}.iso"
 LOG_OUT="${LOG_DIR}/build.log"
 cleanup_build_log() {
@@ -363,7 +612,8 @@ cleanup_build_log() {
    if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
        rm -f "${LOG_ARCHIVE}"
-        tar -czf "${LOG_ARCHIVE}" -C "${DIST_DIR}" "$(basename "${LOG_DIR}")" 2>/dev/null || true
+        tar -czf "${LOG_ARCHIVE}" -C "$(dirname "${LOG_DIR}")" "$(basename "${LOG_DIR}")" 2>/dev/null || true
        rm -rf "${LOG_DIR}"
    fi
    exit "${status}"
@@ -451,6 +701,32 @@ run_step_sh() {
    run_step "${step_name}" "${step_slug}" sh -c "${step_script}"
 }
 # run_optional_step_sh NAME SLUG SCRIPT
 #
 # Runs SCRIPT through "sh -c" as a build step that is allowed to fail.
 # With BEE_REQUIRE_MEMTEST=1 the step is handed to run_step_sh instead.
 # Otherwise the output is captured in ${LOG_DIR}/SLUG.log, replayed on stdout,
 # and a failure only produces a WARNING on stderr.
 run_optional_step_sh() {
    step_name="$1"
    step_slug="$2"
    step_script="$3"
    case "${BEE_REQUIRE_MEMTEST:-0}" in
        1)
            run_step_sh "${step_name}" "${step_slug}" "${step_script}"
            return 0
            ;;
    esac
    step_log="${LOG_DIR}/${step_slug}.log"
    echo ""
    echo "=== optional step: ${step_name} ==="
    echo "=== optional step log: ${step_log} ==="
    # Errexit is suspended so the exit status can be inspected instead of
    # aborting the build.
    set +e
    sh -c "${step_script}" > "${step_log}" 2>&1
    step_status=$?
    set -e
    cat "${step_log}"
    if [ "${step_status}" -eq 0 ]; then
        echo "=== optional step OK: ${step_name} ==="
    else
        echo "WARNING: optional step failed: ${step_name} (see ${step_log})" >&2
    fi
 }
 start_build_log
 # Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
@@ -583,7 +859,6 @@ rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-release" \
    "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
    "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
@@ -857,6 +1132,17 @@ fi
 ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
 if [ -f "$ISO_RAW" ]; then
    dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
    if iso_memtest_present "$ISO_RAW"; then
        :
    else
        memtest_status=$?
        if [ "$memtest_status" -eq 1 ]; then
            recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
            dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
        elif [ "$memtest_status" -eq 2 ]; then
            memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
        fi
    fi
    validate_iso_memtest "$ISO_RAW"
    cp "$ISO_RAW" "$ISO_OUT"
    echo ""
--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -14,6 +14,11 @@ menuentry "EASY-BEE" {
    initrd  @INITRD_LIVE@
 }
 menuentry "EASY-BEE (graphics/KMS)" {
    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }
 menuentry "EASY-BEE (load to RAM)" {
    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
@@ -24,6 +29,11 @@ menuentry "EASY-BEE (NVIDIA GSP=off)" {
    initrd  @INITRD_LIVE@
 }
 menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }
 menuentry "EASY-BEE (fail-safe)" {
    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
    initrd  @INITRD_LIVE@
--- a/iso/builder/config/bootloaders/isolinux/live.cfg.in
+++ b/iso/builder/config/bootloaders/isolinux/live.cfg.in
@@ -5,6 +5,12 @@ label live-@FLAVOUR@-normal
    initrd @INITRD@
    append @APPEND_LIVE@ bee.nvidia.mode=normal
 label live-@FLAVOUR@-kms
    menu label EASY-BEE (^graphics/KMS)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
 label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
@@ -17,6 +23,12 @@ label live-@FLAVOUR@-gsp-off
    initrd @INITRD@
    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
 label live-@FLAVOUR@-kms-gsp-off
    menu label EASY-BEE (g^raphics/KMS, GSP=off)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
 label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
    linux @LINUX@
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -30,6 +30,7 @@ systemctl enable bee-preflight.service
 systemctl enable bee-audit.service
 systemctl enable bee-web.service
 systemctl enable bee-sshsetup.service
 systemctl enable bee-selfheal.timer
 systemctl enable ssh.service
 systemctl enable lightdm.service 2>/dev/null || true
 systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -58,6 +59,7 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
 chmod +x /usr/local/bin/bee-selfheal   2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
--- a/iso/builder/config/package-lists/bee-amd.list.chroot
+++ b/iso/builder/config/package-lists/bee-amd.list.chroot
@@ -1,3 +1,6 @@
 # AMD GPU firmware
 firmware-amd-graphics
 # AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
 rocm-smi-lib=%%ROCM_SMI_VERSION%%
 rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -71,9 +71,7 @@ lightdm
 firmware-linux-free
 firmware-linux-nonfree
 firmware-misc-nonfree
 firmware-amd-graphics
 firmware-realtek
 firmware-intel-sound
 firmware-bnx2
 firmware-bnx2x
 firmware-cavium
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -52,6 +52,14 @@ else
    fail "nvidia-smi: NOT FOUND"
 fi
 for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
        ok "$tool found: $p"
    else
        fail "$tool: NOT FOUND"
    fi
 done
 echo ""
 echo "-- NVIDIA modules --"
 KO_DIR="/usr/local/lib/nvidia"
@@ -109,6 +117,40 @@ else
    fail "nvidia-smi: not found in PATH"
 fi
 echo ""
 echo "-- OpenCL / John --"
 if [ -f /etc/OpenCL/vendors/nvidia.icd ]; then
    ok "OpenCL ICD present: /etc/OpenCL/vendors/nvidia.icd"
 else
    fail "OpenCL ICD missing: /etc/OpenCL/vendors/nvidia.icd"
 fi
 if ldconfig -p 2>/dev/null | grep -q "libnvidia-opencl.so.1"; then
    ok "libnvidia-opencl.so.1 present in linker cache"
 else
    fail "libnvidia-opencl.so.1 missing from linker cache"
 fi
 if command -v clinfo >/dev/null 2>&1; then
    if clinfo -l 2>/dev/null | grep -q "Platform"; then
        ok "clinfo: OpenCL platform detected"
    else
        fail "clinfo: no OpenCL platform detected"
    fi
 else
    fail "clinfo: not found in PATH"
 fi
 if command -v john >/dev/null 2>&1; then
    if john --list=opencl-devices 2>/dev/null | grep -q "Device #"; then
        ok "john: OpenCL devices detected"
    else
        fail "john: no OpenCL devices detected"
    fi
 else
    fail "john: not found in PATH"
 fi
 echo ""
 echo "-- lib symlinks --"
 for lib in libnvidia-ml libcuda; do
@@ -129,6 +171,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
    fi
 done
 if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
    ok "timer active: bee-selfheal.timer"
 else
    fail "timer NOT active: bee-selfheal.timer"
 fi
 echo ""
 echo "-- runtime health --"
 if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
--- a/iso/overlay/etc/systemd/system/bee-audit.service
+++ b/iso/overlay/etc/systemd/system/bee-audit.service
@@ -1,9 +1,13 @@
 [Unit]
-Description=Bee: on-demand hardware audit (not started automatically)
+Description=Bee: hardware audit
 After=bee-preflight.service bee-network.service bee-nvidia.service
 [Service]
 Type=oneshot
 RemainAfterExit=yes
-ExecStart=/bin/sh -c 'curl -sf -X POST http://localhost/api/audit/run >/dev/null'
+ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /usr/local/bin/bee audit --runtime auto --output file:/appdata/bee/export/bee-audit.json
 StandardOutput=journal
 StandardError=journal
 [Install]
 WantedBy=multi-user.target
--- a/iso/overlay/etc/systemd/system/bee-selfheal.service
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.service
@@ -0,0 +1,9 @@
 [Unit]
 Description=Bee: periodic runtime self-heal
 After=bee-web.service bee-audit.service bee-preflight.service
 [Service]
 Type=oneshot
 ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
 StandardOutput=journal
 StandardError=journal
--- a/iso/overlay/etc/systemd/system/bee-selfheal.timer
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.timer
@@ -0,0 +1,11 @@
 [Unit]
 Description=Bee: run self-heal checks periodically
 [Timer]
 OnBootSec=45sec
 OnUnitActiveSec=60sec
 AccuracySec=15sec
 Unit=bee-selfheal.service
 [Install]
 WantedBy=timers.target
--- a/iso/overlay/etc/systemd/system/bee-web.service
+++ b/iso/overlay/etc/systemd/system/bee-web.service
@@ -1,11 +1,12 @@
 [Unit]
 Description=Bee: hardware audit web viewer
 StartLimitIntervalSec=0
 [Service]
 Type=simple
 ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
 Restart=always
-RestartSec=2
+RestartSec=3
 StandardOutput=journal
 StandardError=journal
 LimitMEMLOCK=infinity
--- a/iso/overlay/etc/systemd/system/lightdm.service.d/bee-display-mode.conf
+++ b/iso/overlay/etc/systemd/system/lightdm.service.d/bee-display-mode.conf
@@ -0,0 +1,6 @@
 [Unit]
 Wants=bee-preflight.service
 After=bee-preflight.service
 [Service]
 ExecStartPre=/usr/local/bin/bee-display-mode
--- a/iso/overlay/usr/local/bin/bee-display-mode
+++ b/iso/overlay/usr/local/bin/bee-display-mode
@@ -0,0 +1,54 @@
 #!/bin/sh
 # Select Xorg display mode based on kernel cmdline.
 # Default is the current server-safe path: keep forced fbdev.
 set -eu
 # cmdline_param KEY
 #
 # Prints the value of the first KEY=VALUE token found in /proc/cmdline and
 # returns 0. Returns 1 without output when no token starts with "KEY=".
 cmdline_param() {
    key="$1"
    for token in $(cat /proc/cmdline 2>/dev/null); do
        # Stripping a literal "KEY=" prefix changes the token only when the
        # token actually starts with it.
        rest=${token#"${key}="}
        [ "$rest" != "$token" ] || continue
        echo "${token#*=}"
        return 0
    done
    return 1
 }
 # Prints a message tagged with the script name.
 log() {
    echo "bee-display-mode: $*"
 }
 # bee.display=<mode> on the kernel command line selects the mode; an absent or
 # empty value falls back to "safe".
 mode="$(cmdline_param bee.display || true)"
 mode="${mode:-safe}"
 xorg_dir="/etc/X11/xorg.conf.d"
 fbdev_conf="${xorg_dir}/10-fbdev.conf"
 fbdev_park="${xorg_dir}/10-fbdev.conf.disabled"
 mkdir -p "$xorg_dir"
 # The forced-fbdev Xorg snippet is toggled by renaming it to and from a
 # ".disabled" name.
 case "$mode" in
    safe|fbdev|"")
        if [ ! -f "$fbdev_park" ] || [ -f "$fbdev_conf" ]; then
            log "mode=${mode}; keeping forced fbdev config"
        else
            mv "$fbdev_park" "$fbdev_conf"
            log "mode=${mode}; restored forced fbdev config"
        fi
        ;;
    kms|auto)
        if [ ! -f "$fbdev_conf" ]; then
            log "mode=${mode}; fbdev config already disabled"
        else
            mv "$fbdev_conf" "$fbdev_park"
            log "mode=${mode}; disabled forced fbdev config"
        fi
        ;;
    *)
        log "unknown bee.display=${mode}; keeping forced fbdev config"
        ;;
 esac
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -2,7 +2,7 @@
 set -eu
 SECONDS=5
-SIZE_MB=64
+SIZE_MB=0
 DEVICES=""
 EXCLUDE=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
@@ -68,8 +68,17 @@ trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
 WORKERS=""
 for id in $(echo "${FINAL}" | tr ',' ' '); do
    log="${TMP_DIR}/gpu-${id}.log"
-    echo "starting gpu ${id}"
+    gpu_size_mb="${SIZE_MB}"
-    "${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${SIZE_MB}" >"${log}" 2>&1 &
+    if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
        total_mb=$(nvidia-smi --id="${id}" --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | tr -d '[:space:]')
        if [ -n "${total_mb}" ] && [ "${total_mb}" -gt 0 ] 2>/dev/null; then
            gpu_size_mb=$(( total_mb * 95 / 100 ))
        else
            gpu_size_mb=512
        fi
    fi
    echo "starting gpu ${id} size=${gpu_size_mb}MB"
    "${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
 done
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -1,10 +1,11 @@
 #!/bin/sh
 set -eu
-SECONDS=300
+DURATION_SEC=300
 DEVICES=""
 EXCLUDE=""
 FORMAT=""
 TEST_SLICE_SECONDS=300
 JOHN_DIR="/usr/local/lib/bee/john/run"
 JOHN_BIN="${JOHN_DIR}/john"
 export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
@@ -116,7 +117,7 @@ ensure_opencl_ready() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
-        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -189,5 +190,54 @@ CHOSEN_FORMAT=$(choose_format) || {
    exit 1
 }
 # run_john_loop OPENCL_ID DEADLINE
 #
 # Runs "./john --test" against one OpenCL device in repeated slices until the
 # epoch-seconds DEADLINE has passed. Each slice lasts the remaining time,
 # capped at TEST_SLICE_SECONDS. Returns 1 as soon as a john run fails,
 # otherwise 0.
 run_john_loop() {
    opencl_id="$1"
    deadline="$2"
    round=0
    while now=$(date +%s); remaining=$((deadline - now)); [ "${remaining}" -gt 0 ]; do
        round=$((round + 1))
        if [ "${remaining}" -gt "${TEST_SLICE_SECONDS}" ]; then
            slice="${TEST_SLICE_SECONDS}"
        else
            slice="${remaining}"
        fi
        echo "device=${opencl_id} round=${round} remaining_sec=${remaining} slice_sec=${slice}"
        ./john --test="${slice}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" || return 1
    done
 }
 # Space-separated list of PIDs that cleanup signals and reaps.
 PIDS=""
 # cleanup - EXIT/INT/TERM handler.
 # Captures the status in effect when the trap fired, signals every PID in PIDS,
 # waits for each of them, then exits with the captured status.
 cleanup() {
    rc=$?
    # Disarm the traps first so the exit below does not re-enter this handler.
    trap - EXIT INT TERM
    # Signal everything before waiting so the workers stop concurrently.
    for pid in ${PIDS}; do
        kill "${pid}" 2>/dev/null || true
    done
    for pid in ${PIDS}; do
        wait "${pid}" 2>/dev/null || true
    done
    # NOTE(review): kill targets only the recorded PIDs; if those are background
    # subshells, a ./john child started inside one may keep running - confirm.
    exit "${rc}"
 }
 trap cleanup EXIT INT TERM
 echo "format=${CHOSEN_FORMAT}"
-exec ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${JOHN_DEVICES}"
+echo "target_seconds=${DURATION_SEC}"
 echo "slice_seconds=${TEST_SLICE_SECONDS}"
 DEADLINE=$(( $(date +%s) + DURATION_SEC ))
 _first=1
 for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
    [ "${_first}" = "1" ] || sleep 3
    _first=0
    run_john_loop "${opencl_id}" "${DEADLINE}" &
    pid=$!
    PIDS="${PIDS} ${pid}"
 done
 FAIL=0
 for pid in ${PIDS}; do
    wait "${pid}" || FAIL=$((FAIL+1))
 done
 [ "${FAIL}" -eq 0 ] || { echo "john: ${FAIL} device(s) failed" >&2; exit 1; }
--- a/iso/overlay/usr/local/bin/bee-network.sh
+++ b/iso/overlay/usr/local/bin/bee-network.sh
@@ -6,25 +6,66 @@ LOG_PREFIX="bee-network"
 log() { echo "[$LOG_PREFIX] $*"; }
-# find physical interfaces: exclude lo and virtual (docker/virbr/veth/tun/tap)
+list_interfaces() {
-interfaces=$(ip -o link show \
+    ip -o link show \
-    | awk -F': ' '{print $2}' \
+        | awk -F': ' '{print $2}' \
-    | grep -v '^lo$' \
+        | grep -v '^lo$' \
-    | grep -vE '^(docker|virbr|veth|tun|tap|br-|bond|dummy)' \
+        | grep -vE '^(docker|virbr|veth|tun|tap|br-|bond|dummy)' \
-    | sort)
+        | sort
 }
-if [ -z "$interfaces" ]; then
+# Give udev a short chance to expose late NICs before the first scan.
 if command -v udevadm >/dev/null 2>&1; then
    udevadm settle --timeout=5 >/dev/null 2>&1 || log "WARN: udevadm settle timed out"
 fi
 started_ifaces=""
 started_count=0
 scan_pass=1
 # Some server NICs appear a bit later after module/firmware init. Do a small
 # bounded rescan window without turning network bring-up into a boot blocker.
 while [ "$scan_pass" -le 3 ]; do
    interfaces=$(list_interfaces)
    if [ -n "$interfaces" ]; then
        for iface in $interfaces; do
            case " $started_ifaces " in
                *" $iface "*) continue ;;
            esac
            log "bringing up $iface"
            if ! ip link set "$iface" up; then
                log "WARN: could not bring up $iface"
                continue
            fi
            carrier=$(cat "/sys/class/net/$iface/carrier" 2>/dev/null || true)
            if [ "$carrier" = "1" ]; then
                log "carrier detected on $iface"
            else
                log "carrier not detected yet on $iface"
            fi
            # DHCP in background — non-blocking, keep dhclient verbose output in the service log.
            dhclient -4 -v -nw "$iface" &
            log "DHCP started for $iface (pid $!)"
            started_ifaces="$started_ifaces $iface"
            started_count=$((started_count + 1))
        done
    fi
    if [ "$scan_pass" -ge 3 ]; then
        break
    fi
    scan_pass=$((scan_pass + 1))
    sleep 2
 done
 if [ "$started_count" -eq 0 ]; then
    log "no physical interfaces found"
    exit 0
 fi
-for iface in $interfaces; do
+log "done (interfaces started: $started_count)"
    log "bringing up $iface"
    ip link set "$iface" up || { log "WARN: could not bring up $iface"; continue; }
    # DHCP in background — non-blocking, keep dhclient verbose output in the service log.
    dhclient -4 -v -nw "$iface" &
    log "DHCP started for $iface (pid $!)"
 done
 log "done"
--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
 log "ldconfig refreshed"
 # Start DCGM host engine so dcgmi can discover GPUs.
-# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
+# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
-# "group is empty" even when GPUs and modules are present.
+# If it started too early (for example via systemd before bee-nvidia-load), it can
-# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
+# keep a stale empty inventory and dcgmi diag later reports no testable entities.
 if command -v nv-hostengine >/dev/null 2>&1; then
    if pgrep -x nv-hostengine >/dev/null 2>&1; then
-        log "nv-hostengine already running — skipping"
+        if command -v pkill >/dev/null 2>&1; then
-    else
+            pkill -x nv-hostengine >/dev/null 2>&1 || true
            tries=0
            while pgrep -x nv-hostengine >/dev/null 2>&1; do
                tries=$((tries + 1))
                if [ "${tries}" -ge 10 ]; then
                    log "WARN: nv-hostengine is still running after restart request"
                    break
                fi
                sleep 1
            done
            if pgrep -x nv-hostengine >/dev/null 2>&1; then
                log "WARN: keeping existing nv-hostengine process"
            else
                log "nv-hostengine restarted"
            fi
        else
            log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
        fi
    fi
    if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
        nv-hostengine
        log "nv-hostengine started"
    fi
--- a/iso/overlay/usr/local/bin/bee-openbox-session
+++ b/iso/overlay/usr/local/bin/bee-openbox-session
@@ -24,7 +24,7 @@ chromium \
    --no-first-run \
    --disable-session-crashed-bubble \
    --disable-features=TranslateUI \
-    --start-fullscreen \
+    --start-maximized \
    http://localhost/ &
 exec openbox
--- a/iso/overlay/usr/local/bin/bee-selfheal
+++ b/iso/overlay/usr/local/bin/bee-selfheal
@@ -0,0 +1,99 @@
 #!/bin/bash
 # bee-selfheal — periodic best-effort recovery for critical live ISO services.
 set -u
 LOG_PREFIX="bee-selfheal"
 EXPORT_DIR="/appdata/bee/export"
 AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
 RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
 LOCK_DIR="/run/bee-selfheal.lock"
 # Prints a message prefixed with "[${LOG_PREFIX}]" on stdout.
 log() {
    printf '%s\n' "[${LOG_PREFIX}] $*"
 }
 # Succeeds when the lspci listing contains the PCI vendor ID 10de (NVIDIA).
 # A missing or failing lspci yields an empty listing and therefore "no".
 have_nvidia_gpu() {
    local pci_listing
    pci_listing="$(lspci -nn 2>/dev/null)"
    grep -qi '10de:' <<<"${pci_listing}"
 }
 # service_active UNIT - succeeds when systemctl reports UNIT as active.
 service_active() {
    local unit="$1"
    systemctl --quiet is-active "${unit}" 2>/dev/null
 }
 # restart_service UNIT
 #
 # Restarts UNIT through systemctl with its output discarded and logs the
 # outcome. Returns 0 when the restart succeeded, 1 otherwise.
 restart_service() {
    local svc="$1"
    if ! systemctl restart "$svc" >/dev/null 2>&1; then
        log "WARN: failed to restart ${svc}"
        return 1
    fi
    log "restarted ${svc}"
    return 0
 }
 # file_ready PATH - succeeds when PATH exists and is not empty.
 file_ready() {
    test -s "$1"
 }
 # artifact_state PATH
 #
 # Prints one word describing PATH:
 #   ready        PATH exists and is not empty
 #   interrupted  PATH is not ready but PATH.tmp exists
 #   missing      neither of the above
 artifact_state() {
    local path="$1"
    local state="missing"
    if [ -s "${path}" ]; then
        state="ready"
    elif [ -e "${path}.tmp" ]; then
        state="interrupted"
    fi
    echo "${state}"
 }
 # web_healthy
 #
 # Probes the local web UI: opens a TCP connection to 127.0.0.1:80 through
 # bash's /dev/tcp, sends "GET /healthz" and succeeds only when the response
 # contains a line that is exactly "ok".
 #
 # The probe is bounded by timeout(1) when that tool is available
 # (BEE_SELFHEAL_WEB_TIMEOUT seconds, default 10). Without a bound, a server
 # that accepts the connection but never answers would block this run forever.
 # A timed-out probe counts as unhealthy.
 # NOTE(review): the match assumes the body line is "ok" without a trailing
 # carriage return - confirm against the /healthz handler.
 web_healthy() {
    local probe='exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3'
    if command -v timeout >/dev/null 2>&1; then
        timeout "${BEE_SELFHEAL_WEB_TIMEOUT:-10}" bash -c "${probe}" \
            >/dev/null 2>&1
    else
        bash -c "${probe}" \
            >/dev/null 2>&1
    fi
 }
 # --- main flow ---------------------------------------------------------------
 # Checks run in a fixed order: NVIDIA device node, runtime-health artifact,
 # audit artifact, web service. Every restart is best effort ("|| true"), so
 # one failed recovery does not stop the remaining checks.
 mkdir -p "${EXPORT_DIR}" /run
 # mkdir without -p fails when the directory already exists, which makes it
 # usable as a lock: only one run can create LOCK_DIR.
 if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
    log "another self-heal run is already active"
    exit 0
 fi
 # Release the lock when the script exits.
 # NOTE(review): a run that is killed without executing this trap leaves
 # LOCK_DIR behind, and later runs will then exit early - confirm this is acceptable.
 trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
 log "start"
 # An NVIDIA PCI device without /dev/nvidia0: restart bee-nvidia.service.
 if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
    log "NVIDIA GPU detected but /dev/nvidia0 is missing"
    restart_service bee-nvidia.service || true
 fi
 # runtime-health.json missing, empty or left half-written: restart
 # bee-preflight.service.
 runtime_state="$(artifact_state "${RUNTIME_JSON}")"
 if [ "${runtime_state}" != "ready" ]; then
    if [ "${runtime_state}" = "interrupted" ]; then
        log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
    else
        log "runtime-health.json missing or empty"
    fi
    restart_service bee-preflight.service || true
 fi
 # bee-audit.json missing, empty or left half-written: restart bee-audit.service.
 audit_state="$(artifact_state "${AUDIT_JSON}")"
 if [ "${audit_state}" != "ready" ]; then
    if [ "${audit_state}" = "interrupted" ]; then
        log "bee-audit.json.tmp exists — interrupted audit write detected"
    else
        log "bee-audit.json missing or empty"
    fi
    restart_service bee-audit.service || true
 fi
 # The web UI is restarted when the unit is not active, or when it is active
 # but fails the health probe.
 if ! service_active bee-web.service; then
    log "bee-web.service is not active"
    restart_service bee-web.service || true
 elif ! web_healthy; then
    log "bee-web health check failed"
    restart_service bee-web.service || true
 fi
 log "done"
--- a/iso/overlay/usr/local/bin/netconf
+++ b/iso/overlay/usr/local/bin/netconf
@@ -3,6 +3,11 @@
 # Type 'a' at any prompt to abort, 'b' to go back.
 set -e  # exit as soon as a command fails
 # Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
 if [ "$(id -u)" -ne 0 ]; then
    exec sudo "$0" "$@"  # replaces this process; the original arguments are forwarded
 fi
 abort() { echo "Aborted."; exit 0; }  # print a notice and exit with status 0
 ask() {
Author	SHA1	Message	Date
Michael Chus	f58c7e58d3	Fix webui streaming recovery regressions	2026-04-05 10:39:09 +03:00
Michael Chus	bf47c8dbd2	Add NVIDIA benchmark reporting flow	2026-04-05 10:30:56 +03:00
Michael Chus	143b7dca5d	Add stability hardening and self-heal recovery	2026-04-05 10:29:37 +03:00
Michael Chus	9826d437a5	Add GPU clock charts and grouped GPU metrics view	2026-04-05 09:57:38 +03:00
Mikhail Chusavitin	f3c14cd893	Harden NIC probing for empty SFP ports	2026-04-04 15:23:15 +03:00
Mikhail Chusavitin	728270dc8e	Unblock bee-web startup and expand support bundle diagnostics	2026-04-04 15:18:43 +03:00
Mikhail Chusavitin	8692f825bc	Use plain repo tags for build version	2026-04-03 10:48:51 +03:00
Mikhail Chusavitin	11f52ac710	Fix task log modal scrolling	2026-04-03 10:36:11 +03:00
Mikhail Chusavitin	1cb398fe83	Show tag version at top of sidebar	2026-04-03 10:08:00 +03:00
Mikhail Chusavitin	7a843be6b0	Stabilize DCGM GPU discovery	2026-04-03 09:50:33 +03:00
Mikhail Chusavitin	7f6386dccc	Restore USB support bundle export on tools page	2026-04-03 09:48:22 +03:00
Mikhail Chusavitin	eea2591bcc	Fix John GPU stress duration semantics	2026-04-03 09:46:16 +03:00
Mikhail Chusavitin	295a19b93a	feat(tasks): run all queued tasks in parallel Tasks are now started simultaneously when multiple are enqueued (e.g. Run All). The worker drains all pending tasks at once and launches each in its own goroutine, waiting via WaitGroup. kmsg watcher updated to use a shared event window with a reference counter across concurrent tasks. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 09:15:06 +03:00
Mikhail Chusavitin	444a7d16cc	fix(iso): increase boot verbosity for service startup visibility Raise loglevel from 3 to 6 (INFO) and add systemd.show_status=1 so kernel driver messages and systemd [ OK ]/[ FAILED ] lines are visible during boot instead of showing only a blank cursor. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 19:33:27 +03:00
Mikhail Chusavitin	fd722692a4	feat(watchdog): hardware error monitor + unified component status store - Add platform/error_patterns.go: pluggable table of kernel log patterns (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct - Add app/component_status_db.go: persistent JSON store (component-status.json) keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never downgrades Warning or Critical - Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks, writes Warning to DB for matched hardware errors - Fix task status: overall_status=FAILED in summary.txt now marks task failed - Audit routine overlays component DB statuses into bee-audit.json on every read Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 19:20:59 +03:00
Mikhail Chusavitin	99cece524c	feat(support-bundle): add PCIe link diagnostics and system logs - Add full dmesg (was tail -200), kern.log, syslog - Add /proc/cmdline, lspci -vvv, nvidia-smi -q - Add per-GPU PCIe link speed/width from sysfs (NVIDIA devices only) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 15:42:28 +03:00
Mikhail Chusavitin	c27449c60e	feat(webui): show current boot source	2026-04-02 15:36:32 +03:00
Mikhail Chusavitin	5ef879e307	feat(webui): add gpu driver restart action	2026-04-02 15:30:23 +03:00
Mikhail Chusavitin	e7df63bae1	fix(app): include extra system logs in support bundle	2026-04-02 13:44:58 +03:00
Mikhail Chusavitin	17ff3811f8	fix(webui): improve tasks logs and ordering	2026-04-02 13:43:59 +03:00
Mikhail Chusavitin	fc7fe0b08e	fix(webui): build support bundle synchronously on download, bypass task queue Support bundle is now built on-the-fly when the user clicks the button, regardless of whether other tasks are running: - GET /export/support.tar.gz builds the bundle synchronously and streams it directly to the client; the temp archive is removed after serving - Remove POST /api/export/bundle and handleAPIExportBundle — the task-queue approach meant the bundle could only be downloaded after navigating away and back, and was blocked entirely while a long SAT test was running - UI: single "Download Support Bundle" button; fetch+blob gives a loading state ("Building...") while the server collects logs, then triggers the browser download with the correct filename from Content-Disposition Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 12:58:00 +03:00
Mikhail Chusavitin	3cf75a541a	build: collect ISO and logs under versioned dist/easy-bee-v{VERSION}/ dir All final artefacts for a given version now land in one place: dist/easy-bee-v4.1/ easy-bee-nvidia-v4.1-amd64.iso easy-bee-nvidia-v4.1-amd64.logs.tar.gz ← log archive (logs dir deleted after archiving) - Introduce OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}" - Move LOG_DIR, LOG_ARCHIVE, and ISO_OUT into OUT_DIR - cleanup_build_log: use dirname(LOG_DIR) as tar -C base so the path is correct regardless of where OUT_DIR lives; delete LOG_DIR after archiving Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 10:19:11 +03:00
Mikhail Chusavitin	1f750d3edd	fix(webui): prevent orphaned workers on restart, reduce metrics polling, add Kill Workers button - tasks: mark TaskRunning tasks as TaskFailed on bee-web restart instead of re-queueing them — prevents duplicate gpu-burn-worker spawns when bee-web crashes mid-test (each restart was launching a new set of 8 workers on top of still-alive orphans from the previous crash) - server: reduce metrics collector interval 1s→5s, grow ring buffer to 360 samples (30 min); cuts nvidia-smi/ipmitool/sensors subprocess rate by 5× - platform: add KillTestWorkers() — scans /proc and SIGKILLs bee-gpu-burn, stress-ng, stressapptest, memtester without relying on pkill/killall - webui: add "Kill Workers" button next to Cancel All; calls POST /api/tasks/kill-workers which cancels the task queue then kills orphaned OS-level processes; shows toast with killed count - metricsdb: sort GPU indices and fan/temp names after map iteration to fix non-deterministic sample reconstruction order (flaky test) - server: fix chartYAxisNumber to use one decimal place for 1000–9999 (e.g. "1,7к" instead of "2к") so Y-axis ticks are distinguishable Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 10:13:43 +03:00
Mikhail Chusavitin	b2b0444131	audit: ignore virtual hdisk and coprocessor noise	2026-04-02 09:56:17 +03:00
Michael Chus	dbab43db90	Fix full-history metrics range loading	2026-04-01 23:55:28 +03:00
Michael Chus	bcb7fe5fe9	Render charts from full SQLite history	2026-04-01 23:52:54 +03:00
Michael Chus	d21d9d191b	fix(build): bump DCGM to 4.5.3-1 — core package updated in CUDA repo Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-01 23:49:57 +03:00
Michael Chus	ef45246ea0	fix(sat): kill entire process group on task cancel exec.CommandContext only kills the direct child (the shell script), leaving grandchildren (john, gpu-burn, etc.) as orphans. Set Setpgid so each SAT job runs in its own process group, then send SIGKILL to the whole group (-pgid) in the Cancel hook. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-01 23:46:33 +03:00
Michael Chus	348db35119	fix(stress): stagger john GPU launches to prevent GWS tuning contention When 8 john processes start simultaneously they race for GPU memory during OpenCL GWS auto-tuning. Slower devices settle on a smaller work size (~594MiB vs 762MiB) and run at 40% instead of 100% load. Add 3s sleep between launches so each instance finishes memory allocation before the next one starts. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-01 23:44:00 +03:00
Michael Chus	1dd7f243f5	Keep chart series colors stable	2026-04-01 23:37:57 +03:00
Michael Chus	938e499ac2	Serve charts from SQLite history only	2026-04-01 23:33:13 +03:00
Michael Chus	964ab39656	fix: run john stress in parallel per GPU, fix chromium fullscreen, filter BMC virtual disks - bee-john-gpu-stress: spawn one john process per OpenCL device in parallel so all GPUs are stressed simultaneously instead of only device 1 - bee-openbox-session: --start-fullscreen → --start-maximized to fix blank white page on first render in fbdev environment - storage collector: skip Virtual HDisk* devices reported by BMC/iDRAC Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-01 23:14:21 +03:00
Michael Chus	c2aecc6ce9	Fix fan chart gaps and task durations	2026-04-01 22:36:11 +03:00
Michael Chus	439b86ce59	Unify live metrics chart rendering	2026-04-01 22:19:33 +03:00
Michael Chus	eb60100297	fix: pcie gen, nccl binary, netconf sudo, boot noise, firmware cleanup - nvidia collector: read pcie.link.gen.current/max from nvidia-smi instead of sysfs to avoid false Gen1 readings when GPU is in ASPM idle state - build: remove bee-nccl-gpu-stress from rm -f list so shell script from overlay is not silently dropped from the ISO - smoketest: add explicit checks for bee-gpu-burn, bee-john-gpu-stress, bee-nccl-gpu-stress, all_reduce_perf - netconf: re-exec via sudo when not root to fix RTNETLINK/resolv.conf errors - auto/config: reduce loglevel 7→3 to show clean systemd output on boot - auto/config: blacklist snd_hda_intel and related audio modules (unused on servers) - package-lists: remove firmware-intel-sound and firmware-amd-graphics from base list; move firmware-amd-graphics to bee-amd variant only - bible-local: mark memtest ADR resolved, document working solution Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-01 21:25:23 +03:00
Mikhail Chusavitin	2baf3be640	Handle memtest recovery probe under set -e	2026-04-01 17:42:13 +03:00
Mikhail Chusavitin	d92f8f41d0	Fix memtest ISO validation false negatives	2026-04-01 12:22:17 +03:00
Mikhail Chusavitin	76a9100779	fix(iso): rebuild image after memtest recovery	2026-04-01 10:01:14 +03:00
Mikhail Chusavitin	1b6d592bf3	feat(iso): add optional kms display boot path	2026-04-01 09:42:59 +03:00
Mikhail Chusavitin	c95bbff23b	fix(metrics): stabilize cpu and power sampling	2026-04-01 09:40:42 +03:00
Mikhail Chusavitin	4e4debd4da	refactor(webui): redesign Burn tab and fix gpu-burn memory defaults - Burn tab: replace 6 flat cards with 3 grouped cards (GPU Stress, Compute Stress, Platform Thermal Cycling) + global Burn Profile - Run All button at top enqueues all enabled tests across all cards - GPU Stress: tool checkboxes enabled/disabled via new /api/gpu/tools endpoint based on driver status (/dev/nvidia0, /dev/kfd) - Compute Stress: checkboxes for cpu/memory-stress/stressapptest - Platform Thermal Cycling: component checkboxes (cpu/nvidia/amd) with platform_components param wired through to PlatformStressOptions - bee-gpu-burn: default size-mb changed from 64 to 0 (auto); script now queries nvidia-smi memory.total per GPU and uses 95% of it - platform_stress: removed hardcoded --size-mb 64; respects Components field to selectively run CPU and/or GPU load goroutines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-01 09:39:07 +03:00
Mikhail Chusavitin	5839f870b7	fix(iso): include full nvidia opencl runtime	2026-04-01 09:16:06 +03:00
Mikhail Chusavitin	b447717a5a	fix(iso): harden boot network bring-up - v3.20	2026-04-01 09:10:55 +03:00
Mikhail Chusavitin	f6f4923ac9	fix(iso): recover memtest after live-build	2026-04-01 08:55:57 +03:00