Benchmark: parallel GPU mode, resilient inventory query, server model in results

- Add parallel GPU mode (checkbox, off by default): runs all selected GPUs simultaneously via a single bee-gpu-burn invocation instead of sequentially; per-GPU telemetry, throttle counters, TOPS, and scoring are preserved
- Make queryBenchmarkGPUInfo resilient: it falls back to a base field set when extended fields (attribute.multiprocessor_count, power.default_limit) cause exit status 2, which prevents lgc normalization from being silently skipped
- Log an explicit "graphics clock lock skipped" note when inventory is unavailable
- Collect the server model from DMI (/sys/class/dmi/id/product_name) and store it in the result JSON; benchmark history columns now show "Server Model (N× GPU Model)" grouped by server+GPU type rather than by individual GPU index

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
fix logo wallpaper
2026-04-07 18:32:15 +03:00 · 2026-04-07 10:15:38 +03:00 · 2026-04-06 22:30:59 +03:00 · 2026-04-06 22:26:52 +03:00 · 2026-04-06 21:06:21 +03:00 · 2026-04-06 21:06:16 +03:00
76 changed files with 10826 additions and 1021 deletions
--- a/audit/Makefile
+++ b/audit/Makefile
@@ -1,7 +1,10 @@
 LISTEN ?= :8080
 AUDIT_PATH ?=
 EXPORT_DIR ?= $(CURDIR)/.tmp/export
 VERSION ?= $(shell sh ./scripts/resolve-version.sh)
 GO_LDFLAGS := -X main.Version=$(VERSION)
-RUN_ARGS := web --listen $(LISTEN)
+RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
 ifneq ($(AUDIT_PATH),)
 RUN_ARGS += --audit-path $(AUDIT_PATH)
 endif
@@ -9,10 +12,11 @@ endif
 .PHONY: run build test
 run:
-	go run ./cmd/bee $(RUN_ARGS)
+	mkdir -p $(EXPORT_DIR)
 	go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
 build:
-	go build -o bee ./cmd/bee
+	go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
 test:
 	go test ./...
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -8,6 +8,7 @@ import (
 	"log/slog"
 	"os"
 	"runtime/debug"
 	"strconv"
 	"strings"
 	"bee/audit/internal/app"
@@ -21,30 +22,7 @@ var Version = "dev"
 func buildLabel() string {
 	label := strings.TrimSpace(Version)
 	if label == "" {
-		label = "dev"
+		return "dev"
 	}
 	if info, ok := debug.ReadBuildInfo(); ok {
 		var revision string
 		var modified bool
 		for _, setting := range info.Settings {
 			switch setting.Key {
 			case "vcs.revision":
 				revision = setting.Value
 			case "vcs.modified":
 				modified = setting.Value == "true"
 			}
 		}
 		if revision != "" {
 			short := revision
 			if len(short) > 12 {
 				short = short[:12]
 			}
 			label += " (" + short
 			if modified {
 				label += "+"
 			}
 			label += ")"
 		}
 	}
 	return label
 }
@@ -53,10 +31,19 @@ func main() {
 	os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
 }
-func run(args []string, stdout, stderr io.Writer) int {
+func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level: slog.LevelInfo,
 	})))
 	defer func() {
 		if rec := recover(); rec != nil {
 			slog.Error("fatal panic",
 				"panic", fmt.Sprint(rec),
 				"stack", string(debug.Stack()),
 			)
 			exitCode = 1
 		}
 	}()
 	if len(args) == 0 {
 		printRootUsage(stderr)
@@ -82,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
 		return runWeb(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
 	case "benchmark":
 		return runBenchmark(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -98,8 +87,9 @@ func printRootUsage(w io.Writer) {
  bee preflight --output stdout|file:<path>
  bee export  --target <device>
  bee support-bundle --output stdout|file:<path>
-  bee web     --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
+  bee web     --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
  bee benchmark nvidia [--profile standard|stability|overnight]
  bee version
  bee help [command]`)
 }
@@ -118,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runWeb([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
 	case "benchmark":
 		return runBenchmark([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -304,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("web", flag.ContinueOnError)
 	fs.SetOutput(stderr)
 	listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
-	auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
+	auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
 	exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
 	title := fs.String("title", "Bee Hardware Audit", "page title")
 	fs.Usage = func() {
@@ -407,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	slog.Info("sat archive written", "target", target, "path", archive)
 	return 0
 }
 // runBenchmark implements the "bee benchmark <target>" subcommand.
 //
 // args are the arguments after "benchmark"; the first one selects the target
 // (only "nvidia" is accepted) and the rest are parsed as flags. Help text goes
 // to stdout; usage errors and benchmark progress lines go to stderr.
 //
 // Returns 0 on success or when help was requested, 2 on a usage error, and 1
 // when the benchmark itself fails.
 func runBenchmark(args []string, stdout, stderr io.Writer) int {
 	const usage = "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]"
 	if len(args) == 0 {
 		fmt.Fprintln(stderr, usage)
 		return 2
 	}
 	if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
 		fmt.Fprintln(stdout, usage)
 		return 0
 	}
 	target := args[0]
 	if target != "nvidia" {
 		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
 		fmt.Fprintln(stderr, usage)
 		return 2
 	}
 	fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
 	fs.SetOutput(stderr)
 	profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
 	devices := fs.String("devices", "", "comma-separated GPU indices to include")
 	exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
 	sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
 	skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
 	if err := fs.Parse(args[1:]); err != nil {
 		// The flag package has already printed the help or the parse error.
 		if err == flag.ErrHelp {
 			return 0
 		}
 		return 2
 	}
 	if fs.NArg() != 0 {
 		fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
 		return 2
 	}
 	includeIndices, err := parseBenchmarkIndexCSV(*devices)
 	if err != nil {
 		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
 		return 2
 	}
 	excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
 	if err != nil {
 		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
 		return 2
 	}
 	application := app.New(platform.New())
 	// Progress lines go to the injected stderr writer (not os.Stderr directly)
 	// so callers that capture stderr see them too.
 	logLine := func(s string) { fmt.Fprintln(stderr, s) }
 	// An empty base dir lets the app layer choose its default benchmark directory.
 	archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
 		Profile:           *profile,
 		SizeMB:            *sizeMB,
 		GPUIndices:        includeIndices,
 		ExcludeGPUIndices: excludeIndices,
 		RunNCCL:           !*skipNCCL,
 	}, logLine)
 	if err != nil {
 		slog.Error("run benchmark", "target", target, "err", err)
 		return 1
 	}
 	slog.Info("benchmark archive written", "target", target, "path", archive)
 	return 0
 }
 // parseBenchmarkIndexCSV turns a comma-separated list such as "0, 2,3" into
 // GPU indices. Blank input yields (nil, nil), empty items between commas are
 // skipped, and any item that is not a non-negative integer is an error.
 func parseBenchmarkIndexCSV(raw string) ([]int, error) {
 	trimmed := strings.TrimSpace(raw)
 	if len(trimmed) == 0 {
 		return nil, nil
 	}
 	var out []int
 	for _, item := range strings.Split(trimmed, ",") {
 		token := strings.TrimSpace(item)
 		if len(token) == 0 {
 			continue
 		}
 		n, convErr := strconv.Atoi(token)
 		if convErr != nil || n < 0 {
 			return nil, fmt.Errorf("bad gpu index %q", token)
 		}
 		out = append(out, n)
 	}
 	return out, nil
 }
--- a/audit/cmd/bee/main_test.go
+++ b/audit/cmd/bee/main_test.go
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
 }
 func TestRunVersion(t *testing.T) {
 	t.Parallel()
 	old := Version
 	Version = "test-version"
 	t.Cleanup(func() { Version = old })
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
 	}
 }
 // TestBuildLabelUsesVersionAsIs overrides the package-level Version and
 // expects buildLabel to return exactly that string. Version is restored on
 // cleanup.
 func TestBuildLabelUsesVersionAsIs(t *testing.T) {
 	const want = "1.2.3"
 	saved := Version
 	t.Cleanup(func() { Version = saved })
 	Version = want
 	got := buildLabel()
 	if got != want {
 		t.Fatalf("buildLabel=%q want %q", got, want)
 	}
 }
 func TestRunExportRequiresTarget(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -19,17 +19,18 @@ import (
 )
 var (
-	DefaultExportDir       = "/appdata/bee/export"
+	DefaultExportDir        = "/appdata/bee/export"
-	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath    = DefaultExportDir + "/bee-audit.log"
+	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath      = DefaultExportDir + "/bee-web.log"
+	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath  = DefaultExportDir + "/bee-network.log"
+	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath   = DefaultExportDir + "/bee-nvidia.log"
+	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath      = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath  = DefaultExportDir + "/runtime-health.log"
+	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir     = DefaultExportDir + "/techdump"
+	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir      = DefaultExportDir + "/bee-sat"
+	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
 	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
 )
 type App struct {
@@ -40,6 +41,8 @@ type App struct {
 	sat       satRunner
 	runtime   runtimeChecker
 	installer installer
 	// StatusDB is the unified component health store (nil if unavailable).
 	StatusDB *ComponentStatusDB
 }
 type ActionResult struct {
@@ -112,6 +115,12 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 type satRunner interface {
 	RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
 	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -136,7 +145,7 @@ type runtimeChecker interface {
 }
 func New(platform *platform.System) *App {
-	return &App{
+	a := &App{
 		network:   platform,
 		services:  platform,
 		exports:   platform,
@@ -145,6 +154,10 @@ func New(platform *platform.System) *App {
 		runtime:   platform,
 		installer: platform,
 	}
 	if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
 		a.StatusDB = db
 	}
 	return a
 }
 // ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
@@ -154,7 +167,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
 	if err != nil {
 		return nil, err
 	}
-	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
 	return json.MarshalIndent(snap, "", "  ")
 }
@@ -174,7 +187,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		}
 	}
 	result := collector.Run(runtimeMode)
-	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
@@ -189,10 +202,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -217,10 +227,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -526,10 +533,56 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
 	return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
 }
 // RunNvidiaTargetedStressValidatePack forwards to the SAT runner's method of
 // the same name, using DefaultSATBaseDir when baseDir is blank.
 func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	dir := baseDir
 	if len(strings.TrimSpace(dir)) == 0 {
 		dir = DefaultSATBaseDir
 	}
 	return a.sat.RunNvidiaTargetedStressValidatePack(ctx, dir, durationSec, gpuIndices, logFunc)
 }
 // RunNvidiaStressPack calls RunNvidiaStressPackCtx with a background context.
 func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	ctx := context.Background()
 	return a.RunNvidiaStressPackCtx(ctx, baseDir, opts, logFunc)
 }
 // RunNvidiaBenchmark calls RunNvidiaBenchmarkCtx with a background context.
 func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	ctx := context.Background()
 	return a.RunNvidiaBenchmarkCtx(ctx, baseDir, opts, logFunc)
 }
 // RunNvidiaBenchmarkCtx forwards to the SAT runner's RunNvidiaBenchmark,
 // using DefaultBenchmarkBaseDir when baseDir is blank.
 func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	dir := baseDir
 	if len(strings.TrimSpace(dir)) == 0 {
 		dir = DefaultBenchmarkBaseDir
 	}
 	return a.sat.RunNvidiaBenchmark(ctx, dir, opts, logFunc)
 }
 // RunNvidiaOfficialComputePack forwards to the SAT runner's method of the
 // same name, using DefaultSATBaseDir when baseDir is blank.
 func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	dir := baseDir
 	if len(strings.TrimSpace(dir)) == 0 {
 		dir = DefaultSATBaseDir
 	}
 	return a.sat.RunNvidiaOfficialComputePack(ctx, dir, durationSec, gpuIndices, logFunc)
 }
 // RunNvidiaTargetedPowerPack forwards to the SAT runner's method of the same
 // name, using DefaultSATBaseDir when baseDir is blank.
 func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	dir := baseDir
 	if len(strings.TrimSpace(dir)) == 0 {
 		dir = DefaultSATBaseDir
 	}
 	return a.sat.RunNvidiaTargetedPowerPack(ctx, dir, durationSec, gpuIndices, logFunc)
 }
 // RunNvidiaPulseTestPack forwards to the SAT runner's method of the same
 // name, using DefaultSATBaseDir when baseDir is blank.
 func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	dir := baseDir
 	if len(strings.TrimSpace(dir)) == 0 {
 		dir = DefaultSATBaseDir
 	}
 	return a.sat.RunNvidiaPulseTestPack(ctx, dir, durationSec, gpuIndices, logFunc)
 }
 // RunNvidiaBandwidthPack forwards to the SAT runner's method of the same
 // name, using DefaultSATBaseDir when baseDir is blank.
 func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
 	dir := baseDir
 	if len(strings.TrimSpace(dir)) == 0 {
 		dir = DefaultSATBaseDir
 	}
 	return a.sat.RunNvidiaBandwidthPack(ctx, dir, gpuIndices, logFunc)
 }
 func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -880,6 +933,12 @@ func latestSATSummaries() []string {
 		prefix string
 	}{
 		{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
 		{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
 		{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
 		{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
 		{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
 		{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
 		{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
 		{label: "Memory SAT", prefix: "memory-"},
 		{label: "Storage SAT", prefix: "storage-"},
 		{label: "CPU SAT", prefix: "cpu-"},
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -120,15 +120,21 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 }
 type fakeSAT struct {
-	runNvidiaFn       func(string) (string, error)
+	runNvidiaFn               func(string) (string, error)
-	runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
+	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
-	runMemoryFn       func(string) (string, error)
+	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
-	runStorageFn      func(string) (string, error)
+	runNvidiaComputeFn        func(string, int, []int) (string, error)
-	runCPUFn          func(string, int) (string, error)
+	runNvidiaPowerFn          func(string, int, []int) (string, error)
-	detectVendorFn    func() string
+	runNvidiaPulseFn          func(string, int, []int) (string, error)
-	listAMDGPUsFn     func() ([]platform.AMDGPUInfo, error)
+	runNvidiaBandwidthFn      func(string, []int) (string, error)
-	runAMDPackFn      func(string) (string, error)
+	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
-	listNvidiaGPUsFn  func() ([]platform.NvidiaGPU, error)
+	runMemoryFn               func(string) (string, error)
 	runStorageFn              func(string) (string, error)
 	runCPUFn                  func(string, int) (string, error)
 	detectVendorFn            func() string
 	listAMDGPUsFn             func() ([]platform.AMDGPUInfo, error)
 	runAMDPackFn              func(string) (string, error)
 	listNvidiaGPUsFn          func() ([]platform.NvidiaGPU, error)
 }
 func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -139,6 +145,48 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaBenchmark calls the runNvidiaBenchmarkFn hook when it is set and
 // otherwise falls back to runNvidiaFn.
 func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
 	if hook := f.runNvidiaBenchmarkFn; hook != nil {
 		return hook(baseDir, opts)
 	}
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaTargetedStressValidatePack calls the runNvidiaTargetedStressFn hook
 // when it is set and otherwise falls back to runNvidiaFn.
 func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if hook := f.runNvidiaTargetedStressFn; hook != nil {
 		return hook(baseDir, durationSec, gpuIndices)
 	}
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaOfficialComputePack calls the runNvidiaComputeFn hook when it is
 // set and otherwise falls back to runNvidiaFn.
 func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if hook := f.runNvidiaComputeFn; hook != nil {
 		return hook(baseDir, durationSec, gpuIndices)
 	}
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaTargetedPowerPack calls the runNvidiaPowerFn hook when it is set
 // and otherwise falls back to runNvidiaFn.
 func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if hook := f.runNvidiaPowerFn; hook != nil {
 		return hook(baseDir, durationSec, gpuIndices)
 	}
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaPulseTestPack calls the runNvidiaPulseFn hook when it is set and
 // otherwise falls back to runNvidiaFn.
 func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if hook := f.runNvidiaPulseFn; hook != nil {
 		return hook(baseDir, durationSec, gpuIndices)
 	}
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaBandwidthPack calls the runNvidiaBandwidthFn hook when it is set
 // and otherwise falls back to runNvidiaFn.
 func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
 	if hook := f.runNvidiaBandwidthFn; hook != nil {
 		return hook(baseDir, gpuIndices)
 	}
 	return f.runNvidiaFn(baseDir)
 }
 func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
 	if f.runNvidiaStressFn != nil {
 		return f.runNvidiaStressFn(baseDir, opts)
@@ -754,6 +802,26 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 		}
 	}
 	for _, want := range []string{
 		"/system/ip-link.txt",
 		"/system/ip-link-stats.txt",
 		"/system/ethtool-info.txt",
 		"/system/ethtool-link.txt",
 		"/system/ethtool-module.txt",
 		"/system/mstflint-query.txt",
 	} {
 		var found bool
 		for _, name := range names {
 			if contains(name, want) {
 				found = true
 				break
 			}
 		}
 		if !found {
 			t.Fatalf("support bundle missing %s, names=%v", want, names)
 		}
 	}
 	var foundRaw bool
 	for _, name := range names {
 		if contains(name, "/export/bee-sat/memory-run/verbose.log") {
--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -0,0 +1,48 @@
 package app
 import (
 	"fmt"
 	"os"
 	"path/filepath"
 )
 // atomicWriteFile replaces the file at path with data by writing to
 // "<path>.tmp", syncing it, and renaming it over the target. The parent
 // directory is created if missing. The temp file is removed if any step
 // fails. After the rename the parent directory is synced on a best-effort
 // basis; errors from that step are ignored.
 func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
 	parent := filepath.Dir(path)
 	if err := os.MkdirAll(parent, 0755); err != nil {
 		return fmt.Errorf("mkdir %s: %w", parent, err)
 	}
 	tmpPath := path + ".tmp"
 	tmp, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
 	if err != nil {
 		return fmt.Errorf("open temp %s: %w", tmpPath, err)
 	}
 	committed := false
 	defer func() {
 		// Closing again after a successful Close is harmless; its error is dropped.
 		_ = tmp.Close()
 		if committed {
 			return
 		}
 		_ = os.Remove(tmpPath)
 	}()
 	if _, err = tmp.Write(data); err != nil {
 		return fmt.Errorf("write temp %s: %w", tmpPath, err)
 	}
 	if err = tmp.Sync(); err != nil {
 		return fmt.Errorf("sync temp %s: %w", tmpPath, err)
 	}
 	if err = tmp.Close(); err != nil {
 		return fmt.Errorf("close temp %s: %w", tmpPath, err)
 	}
 	if err = os.Rename(tmpPath, path); err != nil {
 		return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
 	}
 	if dirHandle, openErr := os.Open(parent); openErr == nil {
 		_ = dirHandle.Sync()
 		_ = dirHandle.Close()
 	}
 	committed = true
 	return nil
 }
--- a/audit/internal/app/atomic_write_test.go
+++ b/audit/internal/app/atomic_write_test.go
@@ -0,0 +1,71 @@
 package app
 import (
 	"encoding/json"
 	"os"
 	"path/filepath"
 	"testing"
 	"bee/audit/internal/schema"
 )
 // TestAtomicWriteFileReplacesTargetWithoutLeavingTmp seeds a file, overwrites
 // it through atomicWriteFile, and checks both the new content and that no
 // "<path>.tmp" file remains.
 func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
 	const want = "new\n"
 	target := filepath.Join(t.TempDir(), "bee-audit.json")
 	if err := os.WriteFile(target, []byte("old\n"), 0644); err != nil {
 		t.Fatalf("seed file: %v", err)
 	}
 	if err := atomicWriteFile(target, []byte(want), 0644); err != nil {
 		t.Fatalf("atomicWriteFile: %v", err)
 	}
 	content, err := os.ReadFile(target)
 	if err != nil {
 		t.Fatalf("read final: %v", err)
 	}
 	if got := string(content); got != want {
 		t.Fatalf("final content=%q want %q", got, want)
 	}
 	if _, statErr := os.Stat(target + ".tmp"); !os.IsNotExist(statErr) {
 		t.Fatalf("tmp file should be absent after success, err=%v", statErr)
 	}
 }
 func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
 	path := filepath.Join(t.TempDir(), "runtime-health.json")
 	a := &App{
 		runtime: fakeRuntime{
 			collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
 				return schema.RuntimeHealth{
 					Status:      "OK",
 					ExportDir:   exportDir,
 					DriverReady: true,
 					CUDAReady:   true,
 				}, nil
 			},
 		},
 	}
 	got, err := a.RunRuntimePreflight("file:" + path)
 	if err != nil {
 		t.Fatalf("RunRuntimePreflight: %v", err)
 	}
 	if got != path {
 		t.Fatalf("path=%q want %q", got, path)
 	}
 	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
 		t.Fatalf("tmp file should be absent after success, err=%v", err)
 	}
 	raw, err := os.ReadFile(path)
 	if err != nil {
 		t.Fatalf("read runtime file: %v", err)
 	}
 	var health schema.RuntimeHealth
 	if err := json.Unmarshal(raw, &health); err != nil {
 		t.Fatalf("json unmarshal: %v", err)
 	}
 	if health.Status != "OK" {
 		t.Fatalf("status=%q want OK", health.Status)
 	}
 }
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -0,0 +1,268 @@
 package app
 import (
 	"encoding/json"
 	"os"
 	"path/filepath"
 	"strings"
 	"sync"
 	"time"
 )
 // ComponentStatusDB is a persistent, append-only store of hardware component health records.
 // Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
 // Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
 // the component stays at the highest observed severity until explicitly reset.
 // NOTE(review): no reset operation is visible in this part of the file — confirm one exists.
 type ComponentStatusDB struct {
 	// path is the JSON file the records are loaded from and saved to.
 	path    string
 	// mu guards records.
 	mu      sync.Mutex
 	// records maps a component key to its current record.
 	records map[string]*ComponentStatusRecord
 }
 // ComponentStatusRecord holds the current and historical health of one hardware component.
 type ComponentStatusRecord struct {
 	// ComponentKey is the identity string the record is stored under.
 	ComponentKey  string                 `json:"component_key"`
 	Status        string                 `json:"status"` // "OK", "Warning", "Critical", "Unknown"
 	// LastCheckedAt is updated on every recorded observation.
 	LastCheckedAt time.Time              `json:"last_checked_at"`
 	// LastChangedAt is updated only when Status is set.
 	LastChangedAt time.Time              `json:"last_changed_at"`
 	// ErrorSummary is the detail of the observation that last raised the severity.
 	ErrorSummary  string                 `json:"error_summary,omitempty"`
 	// History lists every observation in the order it was recorded.
 	History       []ComponentStatusEntry `json:"history"`
 }
 // ComponentStatusEntry is one observation written to a component's history.
 type ComponentStatusEntry struct {
 	// At is the time the observation was recorded.
 	At     time.Time `json:"at"`
 	// Status is the status reported by this observation, stored as given.
 	Status string    `json:"status"`
 	Source string    `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
 	// Detail is optional free-form text describing the observation.
 	Detail string    `json:"detail,omitempty"`
 }
 // OpenComponentStatusDB loads the JSON status DB stored at path, creating the
 // parent directory when needed. A missing file yields an empty DB; any other
 // read error is returned. Content that fails to decode is ignored and the DB
 // starts empty.
 func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return nil, err
 	}
 	raw, err := os.ReadFile(path)
 	if err != nil && !os.IsNotExist(err) {
 		return nil, err
 	}
 	store := &ComponentStatusDB{
 		path:    path,
 		records: map[string]*ComponentStatusRecord{},
 	}
 	if len(raw) == 0 {
 		return store, nil
 	}
 	var loaded []ComponentStatusRecord
 	if json.Unmarshal(raw, &loaded) != nil {
 		return store, nil
 	}
 	for i := range loaded {
 		rec := &loaded[i]
 		store.records[rec.ComponentKey] = rec
 	}
 	return store, nil
 }
 // Record writes one observation for the given component key.
 // source is a short label like "sat:nvidia" or "watchdog:kmsg".
 // status is "OK", "Warning", "Critical", or "Unknown".
 // OK never downgrades an existing Warning or Critical status.
 //
 // A nil receiver or a blank key makes the call a no-op. Every call appends to
 // the record's history and then saves the DB; the save error is discarded.
 func (db *ComponentStatusDB) Record(key, source, status, detail string) {
 	if db == nil || strings.TrimSpace(key) == "" {
 		return
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	now := time.Now().UTC()
 	rec, exists := db.records[key]
 	if !exists {
 		// First observation for this key: start an empty record.
 		rec = &ComponentStatusRecord{ComponentKey: key}
 		db.records[key] = rec
 	}
 	rec.LastCheckedAt = now
 	entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
 	rec.History = append(rec.History, entry)
 	// Status merge: OK never downgrades Warning/Critical.
 	newSev := componentSeverity(status)
 	curSev := componentSeverity(rec.Status)
 	if newSev > curSev {
 		// Strictly higher severity wins and its detail becomes the summary.
 		rec.Status = status
 		rec.LastChangedAt = now
 		rec.ErrorSummary = detail
 	} else if rec.Status == "" {
 		// No status yet and the new one ranks 0 (e.g. "Unknown"): store it as is.
 		rec.Status = status
 		rec.LastChangedAt = now
 	}
 	// Best effort: a failed save is ignored and the in-memory state is kept.
 	_ = db.saveLocked()
 }
 // Get looks up the record stored under key and returns a copy of it. The
 // second result is false when the receiver is nil or the key is unknown.
 func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
 	var zero ComponentStatusRecord
 	if db == nil {
 		return zero, false
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	if rec, found := db.records[key]; found {
 		return *rec, true
 	}
 	return zero, false
 }
 // All returns copies of every stored record, in map iteration order (not
 // sorted). A nil receiver yields nil.
 func (db *ComponentStatusDB) All() []ComponentStatusRecord {
 	if db == nil {
 		return nil
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	snapshot := make([]ComponentStatusRecord, len(db.records))
 	next := 0
 	for _, rec := range db.records {
 		snapshot[next] = *rec
 		next++
 	}
 	return snapshot
 }
 // saveLocked serializes all records as indented JSON and replaces the DB file.
 // The caller must hold db.mu.
 //
 // The data is written to "<path>.tmp" and then renamed over the target, so an
 // interrupted write never leaves a truncated DB file. This matters because
 // OpenComponentStatusDB ignores content it cannot decode and starts empty,
 // which would drop all recorded history.
 func (db *ComponentStatusDB) saveLocked() error {
 	records := make([]ComponentStatusRecord, 0, len(db.records))
 	for _, r := range db.records {
 		records = append(records, *r)
 	}
 	data, err := json.MarshalIndent(records, "", "  ")
 	if err != nil {
 		return err
 	}
 	tmpPath := db.path + ".tmp"
 	if err := os.WriteFile(tmpPath, data, 0644); err != nil {
 		// Do not leave a partial temp file behind.
 		_ = os.Remove(tmpPath)
 		return err
 	}
 	if err := os.Rename(tmpPath, db.path); err != nil {
 		_ = os.Remove(tmpPath)
 		return err
 	}
 	return nil
 }
 // componentSeverity ranks a status string so that higher values win:
 // "Critical" is 3, "Warning" is 2, "OK" is 1, and anything else is 0.
 // Surrounding whitespace is ignored; the match is case-sensitive.
 func componentSeverity(status string) int {
 	normalized := strings.TrimSpace(status)
 	if normalized == "Critical" {
 		return 3
 	}
 	if normalized == "Warning" {
 		return 2
 	}
 	if normalized == "OK" {
 		return 1
 	}
 	return 0
 }
 // ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
 // and writes component status records to db for the given SAT target.
 // archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
 //
 // The call is a no-op when db is nil, archivePath is blank, summary.txt cannot
 // be read, or the summary has no overall_status value. Targets not listed in
 // the switch below record nothing.
 func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
 	if db == nil || strings.TrimSpace(archivePath) == "" {
 		return
 	}
 	archivePath = extractArchivePath(archivePath)
 	if archivePath == "" {
 		return
 	}
 	// The run directory is the archive path with the ".tar.gz" suffix removed.
 	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
 	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
 	if err != nil {
 		return
 	}
 	kv := parseSATKV(string(data))
 	overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
 	if overall == "" {
 		return
 	}
 	source := "sat:" + target
 	dbStatus := satStatusToDBStatus(overall)
 	// Map SAT target to component keys.
 	switch target {
 	case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
 		"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
 		"amd-stress", "amd-mem", "amd-bandwidth":
 		// GPU targets are keyed by target name ("pcie:gpu:<target>"), not by device address.
 		db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
 	case "memory", "memory-stress", "sat-stress":
 		db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
 	case "cpu", "platform-stress":
 		db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
 	case "storage":
 		// Try to record per-device if available in summary.
 		recordedAny := false
 		for key, val := range kv {
 			if !strings.HasSuffix(key, "_status") || key == "overall_status" {
 				continue
 			}
 			// The device name is the text before the first underscore of the key
 			// with "_status" removed; keys without such an underscore are skipped.
 			base := strings.TrimSuffix(key, "_status")
 			idx := strings.Index(base, "_")
 			if idx <= 0 {
 				continue
 			}
 			devName := base[:idx]
 			devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
 			db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
 			recordedAny = true
 		}
 		// No per-device keys found: record the overall result for all storage.
 		if !recordedAny {
 			db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
 		}
 	}
 }
 // satStatusToDBStatus maps an upper-cased SAT status to a DB status:
 // "OK" stays "OK", "FAILED" becomes "Warning", and everything else
 // (including "PARTIAL" and "UNSUPPORTED") becomes "Unknown".
 func satStatusToDBStatus(overall string) string {
 	if overall == "OK" {
 		return "OK"
 	}
 	if overall == "FAILED" {
 		return "Warning"
 	}
 	return "Unknown"
 }
 // ExtractArchivePath is the exported entry point for extractArchivePath: it
 // accepts either a bare .tar.gz path or a message such as
 // "Archive written to /path/foo.tar.gz" and returns the path.
 func ExtractArchivePath(s string) string {
 	archive := extractArchivePath(s)
 	return archive
 }
 // ReadSATOverallStatus returns the upper-cased overall_status value from the
 // summary.txt found in the run directory, i.e. archivePath with ".tar.gz"
 // removed. It returns "" when archivePath is blank or the file cannot be read.
 func ReadSATOverallStatus(archivePath string) string {
 	if len(strings.TrimSpace(archivePath)) == 0 {
 		return ""
 	}
 	summaryPath := filepath.Join(strings.TrimSuffix(archivePath, ".tar.gz"), "summary.txt")
 	raw, readErr := os.ReadFile(summaryPath)
 	if readErr != nil {
 		return ""
 	}
 	fields := parseSATKV(string(raw))
 	status := strings.TrimSpace(fields["overall_status"])
 	return strings.ToUpper(status)
 }
 // extractArchivePath returns the .tar.gz path contained in s, which may be a
 // bare path or a message such as "Archive written to /path/foo.tar.gz".
 // Input that does not end in ".tar.gz" is returned trimmed but otherwise
 // unchanged.
 //
 // Paths containing spaces are preserved for the two documented forms (the
 // "Archive written to " message and a bare absolute path). Any other message
 // falls back to taking the last whitespace-separated field.
 func extractArchivePath(s string) string {
 	s = strings.TrimSpace(s)
 	if !strings.HasSuffix(s, ".tar.gz") {
 		return s
 	}
 	const marker = "Archive written to "
 	if i := strings.LastIndex(s, marker); i >= 0 {
 		// Everything after the marker is the path, spaces included.
 		return strings.TrimSpace(s[i+len(marker):])
 	}
 	if strings.HasPrefix(s, "/") {
 		// A bare absolute path; do not split it on embedded spaces.
 		return s
 	}
 	fields := strings.Fields(s)
 	return fields[len(fields)-1]
 }
 // parseSATKV parses newline-separated "key=value" text into a map. Each line
 // is split at its first "="; keys and values are trimmed of surrounding
 // whitespace. Lines without "=" are skipped, and a later duplicate key
 // overwrites an earlier one.
 func parseSATKV(raw string) map[string]string {
 	out := map[string]string{}
 	for _, row := range strings.Split(raw, "\n") {
 		row = strings.TrimSpace(row)
 		sep := strings.Index(row, "=")
 		if sep < 0 {
 			continue
 		}
 		name := strings.TrimSpace(row[:sep])
 		out[name] = strings.TrimSpace(row[sep+1:])
 	}
 	return out
 }
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -9,7 +9,7 @@ import (
 	"bee/audit/internal/schema"
 )
-func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
+func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
 	if snap == nil || strings.TrimSpace(baseDir) == "" {
 		return
 	}
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
 	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
 		applyStorageSAT(snap.Storage, summary)
 	}
 	// Apply unified component status DB — overlaid last so it can only upgrade severity.
 	applyComponentStatusDB(snap, db)
 }
 type satSummary struct {
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
 	}
 }
 // applyComponentStatusDB overlays every record of the component status DB onto
 // the snapshot via mergeComponentStatus. Records are routed by key prefix:
 //   - "pcie:<bdf>" / "pcie:gpu:<bdf>": PCIe devices whose normalised BDF matches
 //   - "storage:all": every storage entry; "storage:<dev>": entries whose
 //     telemetry "linux_device" base name equals <dev>
 //   - "memory:*": every memory entry
 //   - "cpu:*": every CPU entry
 //
 // Records whose status is not recognised by dbStatusToSATStatus and keys with
 // any other prefix are ignored. A nil snap or db is a no-op.
 func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
 	if snap == nil || db == nil {
 		return
 	}
 	for _, rec := range db.All() {
 		status := dbStatusToSATStatus(rec.Status)
 		if status == "" {
 			continue
 		}
 		key := rec.ComponentKey
 		detail := rec.ErrorSummary
 		ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
 		switch {
 		case strings.HasPrefix(key, "pcie:"):
 			// Drop the "pcie:" prefix and an optional "gpu:" sub-type, then
 			// normalise. Keys that carry no BDF either normalise to "" and are
 			// skipped, or (e.g. "pcie:gpu:nvidia" -> "nvidia") match no device.
 			want := sanitizeBDFForLookup(strings.TrimPrefix(strings.TrimPrefix(key, "pcie:"), "gpu:"))
 			if want != "" {
 				for i := range snap.PCIeDevices {
 					dev := &snap.PCIeDevices[i]
 					if dev.BDF != nil && sanitizeBDFForLookup(*dev.BDF) == want {
 						mergeComponentStatus(&dev.HardwareComponentStatus, ts, status, detail)
 					}
 				}
 			}
 		case strings.HasPrefix(key, "storage:"):
 			devName := strings.TrimPrefix(key, "storage:")
 			for i := range snap.Storage {
 				if devName != "all" {
 					// Per-device record: compare against the base name of the
 					// "linux_device" telemetry value (missing value yields "").
 					linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
 					if filepath.Base(strings.TrimSpace(linuxDev)) != devName {
 						continue
 					}
 				}
 				mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
 			}
 		case strings.HasPrefix(key, "memory:"):
 			for i := range snap.Memory {
 				mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
 			}
 		case strings.HasPrefix(key, "cpu:"):
 			for i := range snap.CPUs {
 				mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
 			}
 		}
 	}
 }
 // dbStatusToSATStatus converts ComponentStatusDB status strings to the format
 // expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
 // Surrounding whitespace is ignored and is not part of the returned value.
 // Any other input yields "", which callers treat as "skip this record".
 func dbStatusToSATStatus(s string) string {
 	// Trim before both matching and returning, so " OK " comes back as "OK"
 	// rather than leaking the padding into the merged status.
 	s = strings.TrimSpace(s)
 	switch s {
 	case "OK", "Warning", "Critical", "Unknown":
 		return s
 	default:
 		return ""
 	}
 }
 // sanitizeBDFForLookup produces a trimmed, lower-case form of a PCIe BDF
 // address for equality comparison. An address with exactly one ':' (no
 // domain, e.g. "c8:00.0") is prefixed with "0000:"; other values pass through.
 // Blank input, the literal "gpu", and values with an inner space or tab
 // return "".
 func sanitizeBDFForLookup(bdf string) string {
 	addr := strings.ToLower(strings.TrimSpace(bdf))
 	switch {
 	case addr == "", addr == "gpu", strings.ContainsAny(addr, " \t"):
 		return ""
 	case strings.Count(addr, ":") == 1:
 		return "0000:" + addr
 	default:
 		return addr
 	}
 }
 func ptrString(v *string) string {
 	if v == nil {
 		return ""
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
 	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
 	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)
 	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
 		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		}},
 	}
-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)
 	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
 	"bee-network.service",
 	"bee-nvidia.service",
 	"bee-preflight.service",
 	"bee-selfheal.service",
 	"bee-selfheal.timer",
 	"bee-sshsetup.service",
 }
@@ -32,6 +34,8 @@ var supportBundleCommands = []struct {
 	{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
 	{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
 	{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
 	{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
 	{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
 	{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
@@ -47,6 +51,83 @@ for d in /sys/bus/pci/devices/*/; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
  done
 done
 `}},
 	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool -i "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool -m "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
 if ! command -v mstflint >/dev/null 2>&1; then
  echo "mstflint not found"
  exit 0
 fi
 found=0
 for path in /sys/bus/pci/devices/*; do
  [ -e "$path/vendor" ] || continue
  vendor=$(cat "$path/vendor" 2>/dev/null)
  [ "$vendor" = "0x15b3" ] || continue
  bdf=$(basename "$path")
  found=1
  echo "=== $bdf ==="
  mstflint -d "$bdf" q 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no Mellanox/NVIDIA networking devices found"
 fi
 `}},
 }
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -2,18 +2,21 @@ package collector
 import (
 	"bee/audit/internal/schema"
 	"context"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"time"
 )
 const mellanoxVendorID = 0x15b3
 const nicProbeTimeout = 2 * time.Second
 var (
 	mstflintQuery = func(bdf string) (string, error) {
-		out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
 		if err != nil {
 			return "", err
 		}
@@ -21,7 +24,7 @@ var (
 	}
 	ethtoolInfoQuery = func(iface string) (string, error) {
-		out, err := exec.Command("ethtool", "-i", iface).Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
 		if err != nil {
 			return "", err
 		}
@@ -29,6 +32,14 @@ var (
 	}
 	netIfacesByBDF = listNetIfacesByBDF
 	readNetCarrierFile = func(iface string) (string, error) {
 		path := filepath.Join("/sys/class/net", iface, "carrier")
 		raw, err := os.ReadFile(path)
 		if err != nil {
 			return "", err
 		}
 		return strings.TrimSpace(string(raw)), nil
 	}
 )
 // enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
 	}
 	return ifaces
 }
 // commandOutputWithTimeout runs name with args and returns its standard
 // output. The command is bound to a context that expires after timeout, so a
 // hung tool cannot block the caller indefinitely.
 func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
 	deadlineCtx, release := context.WithTimeout(context.Background(), timeout)
 	defer release()
 	cmd := exec.CommandContext(deadlineCtx, name, args...)
 	return cmd.Output()
 }
 // interfaceHasCarrier reports whether the carrier value read for iface via
 // readNetCarrierFile is "1". A read error counts as "no carrier".
 func interfaceHasCarrier(iface string) bool {
 	state, err := readNetCarrierFile(iface)
 	return err == nil && strings.TrimSpace(state) == "1"
 }
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -12,7 +12,7 @@ import (
 var (
 	ethtoolModuleQuery = func(iface string) (string, error) {
-		out, err := raidToolQuery("ethtool", "-m", iface)
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
 		if err != nil {
 			return "", err
 		}
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
 			}
 		}
-		if out, err := ethtoolModuleQuery(iface); err == nil {
+		if interfaceHasCarrier(iface) {
-			if injectSFPDOMTelemetry(&devs[i], out) {
+			if out, err := ethtoolModuleQuery(iface); err == nil {
-				enriched++
+				if injectSFPDOMTelemetry(&devs[i], out) {
-				continue
+					enriched++
 					continue
 				}
 			}
 		}
 		if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	origReadMAC := readNetAddressFile
 	origEth := ethtoolInfoQuery
 	origModule := ethtoolModuleQuery
 	origCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		queryPCILSPCIDetail = origDetail
 		readPCIVPDFile = origVPD
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		readNetAddressFile = origReadMAC
 		ethtoolInfoQuery = origEth
 		ethtoolModuleQuery = origModule
 		readNetCarrierFile = origCarrier
 	})
 	queryPCILSPCIDetail = func(bdf string) (string, error) {
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		}
 		return "aa:bb:cc:dd:ee:ff", nil
 	}
 	readNetCarrierFile = func(string) (string, error) { return "1", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	}
 }
 // TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier checks that
 // enrichPCIeWithNICTelemetry does not invoke ethtoolModuleQuery for an
 // interface whose carrier file reads "0", while the MAC address is still
 // collected for the device.
 func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
 	// Save the package-level probe hooks and restore them when the test ends.
 	origIfaces := netIfacesByBDF
 	origReadMAC := readNetAddressFile
 	origEth := ethtoolInfoQuery
 	origModule := ethtoolModuleQuery
 	origCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		netIfacesByBDF = origIfaces
 		readNetAddressFile = origReadMAC
 		ethtoolInfoQuery = origEth
 		ethtoolModuleQuery = origModule
 		readNetCarrierFile = origCarrier
 	})
 	// Stub one interface with a MAC address and no carrier.
 	netIfacesByBDF = func(string) []string { return []string{"eth0"} }
 	readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
 	readNetCarrierFile = func(string) (string, error) { return "0", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	// Reaching the module query at all is the failure this test guards against.
 	ethtoolModuleQuery = func(string) (string, error) {
 		t.Fatal("ethtool -m should not be called without carrier")
 		return "", nil
 	}
 	class := "EthernetController"
 	bdf := "0000:18:00.0"
 	devs := []schema.HardwarePCIeDevice{{
 		DeviceClass: &class,
 		BDF:         &bdf,
 	}}
 	out := enrichPCIeWithNICTelemetry(devs)
 	if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
 		t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
 	}
 }
 func TestDBMValue(t *testing.T) {
 	tests := []struct {
 		in   string
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -0,0 +1,252 @@
 package platform
 import (
 	"fmt"
 	"os"
 	"path/filepath"
 	"regexp"
 	"strings"
 	"time"
 )
 // renderBenchmarkReport renders the text report for result without any
 // terminal chart section (it passes nil charts to
 // renderBenchmarkReportWithCharts).
 func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
 	return renderBenchmarkReportWithCharts(result, nil)
 }
 // benchmarkReportChart is one pre-rendered text chart embedded in the report's
 // "Terminal Charts" section.
 type benchmarkReportChart struct {
 	// Title is printed as the chart heading and underlined with '~'.
 	Title   string
 	// Content is the raw chart text; ANSI escapes are stripped when rendering.
 	Content string
 }
 // ansiEscapePattern matches ANSI escape sequences of the form ESC '[' followed
 // by digits/semicolons and a terminating 'm' (e.g. "\x1b[31m", "\x1b[0m").
 // Other escape sequences are not matched.
 var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
 // renderBenchmarkReportWithCharts renders the human-readable text report for
 // a benchmark result. Sections are emitted in a fixed order: header, executive
 // summary (findings), warnings, per-GPU scorecard, interconnect, terminal
 // charts, server power, methodology and the raw file list. Optional sections
 // are omitted when their data is empty or nil. Chart content has ANSI escape
 // sequences stripped, and charts that are blank after stripping are skipped.
 func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
 	fmt.Fprintf(&b, "===========================\n\n")
 	// The layout hard-codes a "UTC" label, so convert to UTC first; otherwise
 	// a timestamp carrying another location would be printed mislabelled.
 	fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
 	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
 	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
 	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
 	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
 	if len(result.Findings) > 0 {
 		fmt.Fprintf(&b, "Executive Summary\n")
 		fmt.Fprintf(&b, "-----------------\n")
 		for _, finding := range result.Findings {
 			fmt.Fprintf(&b, "- %s\n", finding)
 		}
 		b.WriteString("\n")
 	}
 	if len(result.Warnings) > 0 {
 		fmt.Fprintf(&b, "Warnings\n")
 		fmt.Fprintf(&b, "--------\n")
 		for _, warning := range result.Warnings {
 			fmt.Fprintf(&b, "- %s\n", warning)
 		}
 		b.WriteString("\n")
 	}
 	fmt.Fprintf(&b, "Per GPU Scorecard\n")
 	fmt.Fprintf(&b, "-----------------\n")
 	for _, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "GPU %d  %s\n", gpu.Index, gpu.Name)
 		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
 		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
 		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
 		// Efficiency and interconnect scores are printed only when non-zero.
 		if gpu.Scores.TOPSPerSMPerGHz > 0 {
 			fmt.Fprintf(&b, "  Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
 		}
 		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
 		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
 		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
 		if gpu.Scores.InterconnectScore > 0 {
 			fmt.Fprintf(&b, "  Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
 		}
 		if len(gpu.DegradationReasons) > 0 {
 			fmt.Fprintf(&b, "  Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
 		}
 		fmt.Fprintf(&b, "  Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
 		fmt.Fprintf(&b, "  P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
 		if len(gpu.PrecisionResults) > 0 {
 			fmt.Fprintf(&b, "  Precision results:\n")
 			for _, precision := range gpu.PrecisionResults {
 				if precision.Supported {
 					fmt.Fprintf(&b, "    - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
 				} else {
 					fmt.Fprintf(&b, "    - %s: unsupported (%s)\n", precision.Name, precision.Notes)
 				}
 			}
 		}
 		fmt.Fprintf(&b, "  Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
 		if len(gpu.Notes) > 0 {
 			fmt.Fprintf(&b, "  Notes:\n")
 			for _, note := range gpu.Notes {
 				fmt.Fprintf(&b, "    - %s\n", note)
 			}
 		}
 		b.WriteString("\n")
 	}
 	if result.Interconnect != nil {
 		fmt.Fprintf(&b, "Interconnect\n")
 		fmt.Fprintf(&b, "------------\n")
 		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
 		if result.Interconnect.Supported {
 			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
 			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
 		}
 		for _, note := range result.Interconnect.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
 		b.WriteString("\n")
 	}
 	if len(charts) > 0 {
 		fmt.Fprintf(&b, "Terminal Charts\n")
 		fmt.Fprintf(&b, "---------------\n")
 		for _, chart := range charts {
 			// The report is plain text: drop color escapes and skip charts
 			// that contain nothing else.
 			content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
 			if content == "" {
 				continue
 			}
 			fmt.Fprintf(&b, "%s\n", chart.Title)
 			fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
 			fmt.Fprintf(&b, "%s\n\n", content)
 		}
 	}
 	if sp := result.ServerPower; sp != nil {
 		fmt.Fprintf(&b, "Server Power (IPMI)\n")
 		fmt.Fprintf(&b, "-------------------\n")
 		if !sp.Available {
 			fmt.Fprintf(&b, "Unavailable\n")
 		} else {
 			fmt.Fprintf(&b, "  Server idle:         %.0f W\n", sp.IdleW)
 			fmt.Fprintf(&b, "  Server under load:   %.0f W\n", sp.LoadedW)
 			fmt.Fprintf(&b, "  Server delta:        %.0f W\n", sp.DeltaW)
 			fmt.Fprintf(&b, "  GPU reported (sum):  %.0f W\n", sp.GPUReportedSumW)
 			if sp.ReportingRatio > 0 {
 				fmt.Fprintf(&b, "  Reporting ratio:     %.2f  (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
 			}
 		}
 		// Notes are printed even when power data is unavailable.
 		for _, note := range sp.Notes {
 			fmt.Fprintf(&b, "  Note: %s\n", note)
 		}
 		b.WriteString("\n")
 	}
 	fmt.Fprintf(&b, "Methodology\n")
 	fmt.Fprintf(&b, "-----------\n")
 	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
 	fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
 	fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
 	fmt.Fprintf(&b, "Raw Files\n")
 	fmt.Fprintf(&b, "---------\n")
 	fmt.Fprintf(&b, "- result.json\n")
 	fmt.Fprintf(&b, "- report.txt\n")
 	fmt.Fprintf(&b, "- summary.txt\n")
 	fmt.Fprintf(&b, "- verbose.log\n")
 	fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
 	fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
 	fmt.Fprintf(&b, "- gpu-*-steady.log\n")
 	fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
 	fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
 	if result.Interconnect != nil {
 		fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
 	}
 	return b.String()
 }
 // loadBenchmarkReportCharts collects the terminal chart files written for each
 // GPU index in gpuIndices. For every GPU it looks, in order, for the baseline,
 // steady and cooldown files named "gpu-<idx>-<phase>-metrics-term.txt" inside
 // runDir. Files that are missing, unreadable or empty are skipped silently.
 func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
 	type phaseSpec struct {
 		file  string
 		title string
 	}
 	specs := [...]phaseSpec{
 		{file: "baseline", title: "Baseline"},
 		{file: "steady", title: "Steady State"},
 		{file: "cooldown", title: "Cooldown"},
 	}
 	var out []benchmarkReportChart
 	for _, gpuIdx := range gpuIndices {
 		for _, spec := range specs {
 			fileName := fmt.Sprintf("gpu-%d-%s-metrics-term.txt", gpuIdx, spec.file)
 			content, err := os.ReadFile(filepath.Join(runDir, fileName))
 			if err == nil && len(content) > 0 {
 				out = append(out, benchmarkReportChart{
 					Title:   fmt.Sprintf("GPU %d %s", gpuIdx, spec.title),
 					Content: string(content),
 				})
 			}
 		}
 	}
 	return out
 }
 // stripANSIEscapeSequences returns raw with every match of ansiEscapePattern
 // removed.
 func stripANSIEscapeSequences(raw string) string {
 	return ansiEscapePattern.ReplaceAllString(raw, "")
 }
 // formatThrottleLine turns the throttle counters into a single line of
 // "label=value" items separated by two spaces. Counters that are zero are left
 // out; if all are zero the result is "none". With a positive steady-state
 // duration each item shows the share of that window plus whole seconds;
 // otherwise the raw time is shown, in milliseconds when under one second.
 func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
 	labels := [...]string{"sw_power", "sw_thermal", "sync_boost", "hw_thermal", "hw_power_brake"}
 	micros := [...]uint64{
 		t.SWPowerCapUS,
 		t.SWThermalSlowdownUS,
 		t.SyncBoostUS,
 		t.HWThermalSlowdownUS,
 		t.HWPowerBrakeSlowdownUS,
 	}
 	var shown []string
 	for i, us := range micros {
 		if us == 0 {
 			continue
 		}
 		// Counters are in microseconds.
 		seconds := float64(us) / 1e6
 		var item string
 		switch {
 		case steadyDurationSec > 0:
 			share := seconds / steadyDurationSec * 100
 			item = fmt.Sprintf("%s=%.1f%% (%.0fs)", labels[i], share, seconds)
 		case seconds < 1:
 			item = fmt.Sprintf("%s=%.0fms", labels[i], seconds*1000)
 		default:
 			item = fmt.Sprintf("%s=%.1fs", labels[i], seconds)
 		}
 		shown = append(shown, item)
 	}
 	if len(shown) == 0 {
 		return "none"
 	}
 	return strings.Join(shown, "  ")
 }
 // renderBenchmarkSummary renders the machine-readable summary of a benchmark
 // result as one key=value pair per line: run timestamp, profile, overall
 // status, GPU count, normalization status, per-GPU status and composite score,
 // the best composite score, and (when present) interconnect status and
 // maximum bus bandwidth.
 func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	// The key promises UTC, so normalise the timestamp before formatting
 	// instead of emitting whatever zone GeneratedAt happens to carry.
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
 	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
 	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
 	fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
 	var best float64
 	for i, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
 		fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
 		// Seed with the first GPU so a negative score can still be the best.
 		if i == 0 || gpu.Scores.CompositeScore > best {
 			best = gpu.Scores.CompositeScore
 		}
 	}
 	fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
 	if result.Interconnect != nil {
 		fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
 		fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
 	}
 	return b.String()
 }
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -0,0 +1,179 @@
 package platform
 import (
 	"strings"
 	"testing"
 )
 // TestResolveBenchmarkProfile checks that resolveBenchmarkProfile returns the
 // expected phase durations for the empty (default), "stability" and
 // "overnight" profile names.
 func TestResolveBenchmarkProfile(t *testing.T) {
 	t.Parallel()
 	cases := []struct {
 		name    string
 		profile string
 		want    benchmarkProfileSpec
 	}{
 		{
 			// An empty profile name resolves to the standard profile.
 			name:    "default",
 			profile: "",
 			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
 		},
 		{
 			name:    "stability",
 			profile: "stability",
 			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
 		},
 		{
 			name:    "overnight",
 			profile: "overnight",
 			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
 		},
 	}
 	for _, tc := range cases {
 		// Per-iteration copy for the closure below (needed before Go 1.22).
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
 			got := resolveBenchmarkProfile(tc.profile)
 			if got != tc.want {
 				t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
 			}
 		})
 	}
 }
 // TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice checks that
 // normalizeNvidiaBenchmarkOptionsForBenchmark resolves the profile name but
 // does not switch RunNCCL back on when the caller disabled it.
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()
 	opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
 		Profile: "stability",
 		RunNCCL: false,
 	})
 	if opts.Profile != NvidiaBenchmarkProfileStability {
 		t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
 	}
 	if opts.RunNCCL {
 		t.Fatalf("RunNCCL should stay false when explicitly disabled")
 	}
 }
 // TestParseBenchmarkBurnLog feeds parseBenchmarkBurnLog a sample bee-gpu-burn
 // log with two precision profiles (fp16_tensor, fp8_e4m3) and checks the
 // parsed backend, compute capability, profile count, a positive throughput
 // for the first profile and the category of the second.
 func TestParseBenchmarkBurnLog(t *testing.T) {
 	t.Parallel()
 	raw := strings.Join([]string{
 		"loader=bee-gpu-burn",
 		"[gpu 0] device=NVIDIA H100",
 		"[gpu 0] compute_capability=9.0",
 		"[gpu 0] backend=cublasLt",
 		"[gpu 0] duration_s=10",
 		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
 		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
 		"[gpu 0] fp16_tensor_iterations=200",
 		"[gpu 0] fp8_e4m3_iterations=50",
 		"[gpu 0] status=OK",
 	}, "\n")
 	got := parseBenchmarkBurnLog(raw)
 	if got.Backend != "cublasLt" {
 		t.Fatalf("backend=%q want cublasLt", got.Backend)
 	}
 	if got.ComputeCapability != "9.0" {
 		t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
 	}
 	if len(got.Profiles) != 2 {
 		t.Fatalf("profiles=%d want 2", len(got.Profiles))
 	}
 	if got.Profiles[0].TeraOpsPerSec <= 0 {
 		t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
 	}
 	if got.Profiles[1].Category != "fp8" {
 		t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
 	}
 }
 // TestRenderBenchmarkReportIncludesFindingsAndScores renders a report for a
 // single-GPU result and checks that the executive summary heading, the
 // finding text, the composite score and the precision result line all appear.
 func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 	t.Parallel()
 	result := NvidiaBenchmarkResult{
 		BenchmarkVersion:   benchmarkVersion,
 		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
 		OverallStatus:      "PARTIAL",
 		SelectedGPUIndices: []int{0},
 		Normalization: BenchmarkNormalization{
 			Status: "partial",
 		},
 		Findings: []string{"GPU 0 spent measurable time under SW power cap."},
 		GPUs: []BenchmarkGPUResult{
 			{
 				Index:  0,
 				Name:   "NVIDIA H100",
 				Status: "OK",
 				Steady: BenchmarkTelemetrySummary{
 					AvgPowerW:           680,
 					AvgTempC:            79,
 					AvgGraphicsClockMHz: 1725,
 					P95PowerW:           700,
 					P95TempC:            82,
 					P95GraphicsClockMHz: 1800,
 				},
 				Scores: BenchmarkScorecard{
 					ComputeScore:        1200,
 					PowerSustainScore:   96,
 					ThermalSustainScore: 88,
 					StabilityScore:      92,
 					CompositeScore:      1176,
 				},
 				PrecisionResults: []BenchmarkPrecisionResult{
 					{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
 				},
 				Throttle: BenchmarkThrottleCounters{
 					SWPowerCapUS: 1000000,
 				},
 				DegradationReasons: []string{"power_capped"},
 			},
 		},
 	}
 	report := renderBenchmarkReport(result)
 	// Each needle must appear verbatim somewhere in the rendered report.
 	for _, needle := range []string{
 		"Executive Summary",
 		"GPU 0 spent measurable time under SW power cap.",
 		"Composite score: 1176.00",
 		"fp16_tensor: 700.00 TOPS",
 	} {
 		if !strings.Contains(report, needle) {
 			t.Fatalf("report missing %q\n%s", needle, report)
 		}
 	}
 }
 // TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI renders a report
 // with one chart whose content contains ANSI color escapes and checks that
 // the chart title and text are present while the escape sequence is not.
 func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
 	t.Parallel()
 	report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
 		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
 		OverallStatus:      "OK",
 		SelectedGPUIndices: []int{0},
 		Normalization: BenchmarkNormalization{
 			Status: "full",
 		},
 	}, []benchmarkReportChart{
 		{
 			Title:   "GPU 0 Steady State",
 			Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
 		},
 	})
 	for _, needle := range []string{
 		"Terminal Charts",
 		"GPU 0 Steady State",
 		"GPU 0 chart",
 		"42┤───",
 	} {
 		if !strings.Contains(report, needle) {
 			t.Fatalf("report missing %q\n%s", needle, report)
 		}
 	}
 	// The color escape wrapped around the chart text must have been stripped.
 	if strings.Contains(report, "\x1b[31m") {
 		t.Fatalf("report should not contain ANSI escapes\n%s", report)
 	}
 }
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -0,0 +1,158 @@
 package platform
 import "time"
 // Names of the supported NVIDIA benchmark profiles. These are the values
 // carried in NvidiaBenchmarkOptions.Profile and reported back in
 // NvidiaBenchmarkResult.BenchmarkProfile.
 const (
 	NvidiaBenchmarkProfileStandard  = "standard"
 	NvidiaBenchmarkProfileStability = "stability"
 	NvidiaBenchmarkProfileOvernight = "overnight"
 )
 // NvidiaBenchmarkOptions holds the caller-supplied settings for an NVIDIA
 // benchmark run.
 type NvidiaBenchmarkOptions struct {
 	// Profile is the benchmark profile name (see the NvidiaBenchmarkProfile* constants).
 	Profile           string
 	// NOTE(review): SizeMB presumably sizes the burn workload in MiB; confirm against the runner.
 	SizeMB            int
 	// NOTE(review): GPUIndices / ExcludeGPUIndices look like include and exclude
 	// filters over GPU indices; confirm the precedence in the selection code.
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	// RunNCCL selects whether the NCCL interconnect phase is run.
 	RunNCCL           bool
 	ParallelGPUs      bool // run all selected GPUs simultaneously instead of sequentially
 }
 // NvidiaBenchmarkResult is the top-level result of one benchmark run. It is
 // the structure serialized to JSON (see the field tags) and the input to the
 // text report and summary renderers.
 type NvidiaBenchmarkResult struct {
 	BenchmarkVersion   string                       `json:"benchmark_version"`
 	GeneratedAt        time.Time                    `json:"generated_at"`
 	Hostname           string                       `json:"hostname,omitempty"`
 	ServerModel        string                       `json:"server_model,omitempty"`
 	BenchmarkProfile   string                       `json:"benchmark_profile"`
 	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
 	OverallStatus      string                       `json:"overall_status"`
 	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
 	Findings           []string                     `json:"findings,omitempty"`
 	Warnings           []string                     `json:"warnings,omitempty"`
 	Normalization      BenchmarkNormalization       `json:"normalization"`
 	GPUs               []BenchmarkGPUResult         `json:"gpus"`
 	// Interconnect and ServerPower are nil when that data was not collected;
 	// the renderers omit the corresponding sections in that case.
 	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
 	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
 }
 // BenchmarkNormalization records the outcome of the pre-benchmark
 // normalization step: an overall status string, free-form notes and
 // per-GPU details.
 type BenchmarkNormalization struct {
 	Status string                      `json:"status"`
 	Notes  []string                    `json:"notes,omitempty"`
 	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
 }
 // BenchmarkNormalizationGPU holds the normalization details for a single GPU:
 // persistence mode plus the graphics and memory clock lock values (MHz) and
 // their status strings.
 type BenchmarkNormalizationGPU struct {
 	Index                 int      `json:"index"`
 	PersistenceMode       string   `json:"persistence_mode,omitempty"`
 	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
 	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
 	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
 	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
 	Notes                 []string `json:"notes,omitempty"`
 }
 // BenchmarkGPUResult is the per-GPU section of a benchmark result: device
 // identity and limits, telemetry summaries for the baseline, steady and
 // cooldown phases, throttle counters, per-precision results, scores and any
 // degradation reasons or notes.
 type BenchmarkGPUResult struct {
 	Index                  int                        `json:"index"`
 	UUID                   string                     `json:"uuid,omitempty"`
 	Name                   string                     `json:"name,omitempty"`
 	BusID                  string                     `json:"bus_id,omitempty"`
 	VBIOS                  string                     `json:"vbios,omitempty"`
 	ComputeCapability      string                     `json:"compute_capability,omitempty"`
 	Backend                string                     `json:"backend,omitempty"`
 	Status                 string                     `json:"status"`
 	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
 	MultiprocessorCount    int                        `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW     float64                    `json:"default_power_limit_w,omitempty"`
 	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
 	BaseGraphicsClockMHz   float64                    `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
 	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
 	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
 	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
 	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
 	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
 	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
 	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
 	Scores                 BenchmarkScorecard         `json:"scores"`
 	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
 	Notes                  []string                   `json:"notes,omitempty"`
 }
 // BenchmarkTelemetrySummary aggregates GPU telemetry over one benchmark phase
 // (it is used for the baseline, steady and cooldown phases of a GPU result).
 // Avg*/P95* fields are averages and 95th-percentile values in the unit named
 // by the field suffix (C, W, MHz, Pct).
 // NOTE(review): the *CVPct fields are presumably coefficients of variation in
 // percent; confirm against the code that fills them.
 type BenchmarkTelemetrySummary struct {
 	DurationSec         float64 `json:"duration_sec"`
 	Samples             int     `json:"samples"`
 	AvgTempC            float64 `json:"avg_temp_c"`
 	P95TempC            float64 `json:"p95_temp_c"`
 	AvgPowerW           float64 `json:"avg_power_w"`
 	P95PowerW           float64 `json:"p95_power_w"`
 	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
 	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
 	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
 	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
 	AvgUsagePct         float64 `json:"avg_usage_pct"`
 	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
 	ClockCVPct          float64 `json:"clock_cv_pct"`
 	PowerCVPct          float64 `json:"power_cv_pct"`
 	TempCVPct           float64 `json:"temp_cv_pct"`
 	ClockDriftPct       float64 `json:"clock_drift_pct"`
 }
 // BenchmarkThrottleCounters holds the time a GPU spent under each throttle
 // reason. All values are in microseconds (the US suffix); the report renderer
 // converts them to seconds and to a share of the steady-state window.
 type BenchmarkThrottleCounters struct {
 	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
 	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
 	SyncBoostUS            uint64 `json:"sync_boost_us"`
 	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
 	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
 }
 // BenchmarkPrecisionResult is the compute result for one numeric precision
 // (e.g. "fp16_tensor"). When Supported is false the report prints the entry as
 // unsupported together with Notes instead of a throughput figure.
 // NOTE(review): M, N and K look like the matrix dimensions of the workload;
 // confirm against the burn log parser.
 type BenchmarkPrecisionResult struct {
 	Name          string  `json:"name"`
 	Category      string  `json:"category"`
 	Supported     bool    `json:"supported"`
 	Lanes         int     `json:"lanes,omitempty"`
 	M             uint64  `json:"m,omitempty"`
 	N             uint64  `json:"n,omitempty"`
 	K             uint64  `json:"k,omitempty"`
 	Iterations    uint64  `json:"iterations,omitempty"`
 	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
 	Notes         string  `json:"notes,omitempty"`
 }
 // BenchmarkScorecard holds the per-GPU scores shown in the report's
 // "Per GPU Scorecard" section. InterconnectScore and TOPSPerSMPerGHz are only
 // printed by the report when they are greater than zero.
 type BenchmarkScorecard struct {
 	ComputeScore        float64 `json:"compute_score"`
 	PowerSustainScore   float64 `json:"power_sustain_score"`
 	ThermalSustainScore float64 `json:"thermal_sustain_score"`
 	StabilityScore      float64 `json:"stability_score"`
 	InterconnectScore   float64 `json:"interconnect_score"`
 	CompositeScore      float64 `json:"composite_score"`
 	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
 	// Comparable across throttle levels and GPU generations. Low value at normal
 	// clocks indicates silicon degradation.
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }
 // BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
 // power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
 // telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
 // over-reporting its power consumption.
 // When Available is false the report prints "Unavailable" in place of the
 // wattage figures; Notes are printed either way.
 type BenchmarkServerPower struct {
 	Available       bool     `json:"available"`
 	IdleW           float64  `json:"idle_w,omitempty"`
 	LoadedW         float64  `json:"loaded_w,omitempty"`
 	DeltaW          float64  `json:"delta_w,omitempty"`
 	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
 	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
 	Notes           []string `json:"notes,omitempty"`
 }
 // BenchmarkInterconnectResult holds the outcome of the interconnect phase
 // (logged as nccl-all-reduce.log): status flags, the GPUs involved and the
 // average/maximum algorithm and bus bandwidth in GB/s. The report prints the
 // bandwidth figures only when Supported is true.
 type BenchmarkInterconnectResult struct {
 	Status             string   `json:"status"`
 	Attempted          bool     `json:"attempted"`
 	Supported          bool     `json:"supported"`
 	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
 	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
 	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
 	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
 	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
 	Notes              []string `json:"notes,omitempty"`
 }
--- a/audit/internal/platform/error_patterns.go
+++ b/audit/internal/platform/error_patterns.go
@@ -0,0 +1,139 @@
 package platform
 import "regexp"
 // ErrorPattern describes a kernel log pattern that indicates a hardware error.
 // Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
 // BDFGroup and DevGroup tell the consumer which capture group of Re (if any)
 // carries the affected PCIe address or device name.
 type ErrorPattern struct {
 	// Name is a short machine-readable label for logging and deduplication.
 	Name string
 	// Re is the compiled regular expression matched against a single kmsg line.
 	Re *regexp.Regexp
 	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
 	Category string
 	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
 	Severity string
 	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
 	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
 	BDFGroup int
 	// DevGroup is the capture group index (1-based) that contains a device name
 	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
 	DevGroup int
 }
 // HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
 // To add a new pattern: append a new ErrorPattern struct to this slice.
 //
 // Every expression carries the (?i) flag, so matching is case-insensitive and
 // explicit [Xx] character classes are unnecessary. The "corrected" and
 // "correctable" patterns are anchored with \b so they do not also fire on
 // "uncorrected" / "uncorrectable" lines, which contain them as substrings.
 var HardwareErrorPatterns = []ErrorPattern{
 	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
 	{
 		Name:     "nvidia-rminitadapter",
 		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "nvidia-msi-fail",
 		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
 		Category: "gpu",
 		Severity: "warning",
 	},
 	{
 		Name:     "nvidia-aer",
 		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		// NOTE(review): NVRM Xid lines are commonly logged as
 		// "NVRM: Xid (PCI:0000:3b:00): 79, ..." without the ".function" suffix.
 		// If that is the format on target systems this pattern will not match;
 		// confirm against real kmsg output before relaxing the BDF expression,
 		// since consumers of BDFGroup may expect a full domain:bus:dev.fn value.
 		Name:     "nvidia-xid",
 		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
 	{
 		Name:     "pcie-aer",
 		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "pcie-uncorrectable",
 		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*uncorrectable`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "pcie-link-down",
 		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*link.*down`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	// ── Storage ─────────────────────────────────────────────────────────────────
 	{
 		Name:     "blk-io-error",
 		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	{
 		Name:     "nvme-timeout",
 		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	{
 		Name:     "scsi-failed",
 		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
 		Category: "storage",
 		Severity: "warning",
 	},
 	{
 		Name:     "nvme-reset",
 		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	// ── Machine Check Exceptions ────────────────────────────────────────────────
 	{
 		Name:     "mce-hardware-error",
 		Re:       mustPat(`(?i)mce:.*hardware error`),
 		Category: "mce",
 		Severity: "warning",
 	},
 	{
 		// \b keeps this from matching "mce: Uncorrected hardware memory error ...".
 		Name:     "mce-corrected",
 		Re:       mustPat(`(?i)mce:.*\bcorrected`),
 		Category: "mce",
 		Severity: "warning",
 	},
 	// ── Memory ─────────────────────────────────────────────────────────────────
 	{
 		Name:     "edac-ue",
 		Re:       mustPat(`(?i)EDAC.*uncorrectable`),
 		Category: "memory",
 		Severity: "warning",
 	},
 	{
 		// \b keeps this from matching lines that edac-ue already covers.
 		Name:     "edac-ce",
 		Re:       mustPat(`(?i)EDAC.*\bcorrectable`),
 		Category: "memory",
 		Severity: "warning",
 	},
 }
 // mustPat compiles s into a regular expression and panics if s is not a valid
 // pattern. It is meant for the package-level pattern table, where an invalid
 // expression is a programming error.
 func mustPat(s string) *regexp.Regexp {
 	compiled := regexp.MustCompile(s)
 	return compiled
 }
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -20,12 +20,13 @@ type GPUMetricRow struct {
 	MemUsagePct float64 `json:"mem_usage_pct"`
 	PowerW      float64 `json:"power_w"`
 	ClockMHz    float64 `json:"clock_mhz"`
 	MemClockMHz float64 `json:"mem_clock_mhz"`
 }
 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
 func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 	args := []string{
-		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
+		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
 		"--format=csv,noheader,nounits",
 	}
 	if len(gpuIndices) > 0 {
@@ -46,7 +47,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			continue
 		}
 		parts := strings.Split(line, ", ")
-		if len(parts) < 6 {
+		if len(parts) < 7 {
 			continue
 		}
 		idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
@@ -57,6 +58,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			MemUsagePct: parseGPUFloat(parts[3]),
 			PowerW:      parseGPUFloat(parts[4]),
 			ClockMHz:    parseGPUFloat(parts[5]),
 			MemClockMHz: parseGPUFloat(parts[6]),
 		})
 	}
 	return rows, nil
@@ -139,10 +141,10 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
+	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
 	for _, r := range rows {
-		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
+		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
-			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
+			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
@@ -197,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	const PW = plotX2 - plotX1
 	const PH = plotY2 - plotY1
 	// Outer axes
-	const tempAxisX = 60  // temp axis line
+	const tempAxisX = 60   // temp axis line
 	const clockAxisX = 900 // clock axis line
 	colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -120,10 +120,45 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
 		log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
 	}
 	log("Verifying live medium now served from RAM...")
 	status := s.LiveBootSource()
 	if err := verifyInstallToRAMStatus(status); err != nil {
 		return err
 	}
 	log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
 	log("Done. Installation media can be safely disconnected.")
 	return nil
 }
 // verifyInstallToRAMStatus returns nil when status reports the live medium as
 // RAM-backed, and otherwise an error naming where the medium is still mounted
 // from (as rendered by describeLiveBootSource).
 func verifyInstallToRAMStatus(status LiveBootSource) error {
 	if !status.InRAM {
 		return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
 	}
 	return nil
 }
 // describeLiveBootSource renders status as a short human-readable label.
 // Kind "ram" yields "RAM"; "usb", "cdrom" and "disk" yield the medium name
 // followed by the source in parentheses; any other kind yields the bare source.
 // The source is the trimmed Device, falling back to the trimmed Source, and
 // finally to "unknown source" when both are empty.
 func describeLiveBootSource(status LiveBootSource) string {
 	kind := strings.TrimSpace(status.Kind)
 	if kind == "ram" {
 		return "RAM"
 	}
 	origin := "unknown source"
 	if dev := strings.TrimSpace(status.Device); dev != "" {
 		origin = dev
 	} else if src := strings.TrimSpace(status.Source); src != "" {
 		origin = src
 	}
 	var medium string
 	switch kind {
 	case "usb":
 		medium = "USB"
 	case "cdrom":
 		medium = "CD-ROM"
 	case "disk":
 		medium = "disk"
 	default:
 		return origin
 	}
 	return medium + " (" + origin + ")"
 }
 func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
 	in, err := os.Open(src)
 	if err != nil {
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -3,6 +3,8 @@ package platform
 import "testing"
 func TestInferLiveBootKind(t *testing.T) {
 	t.Parallel()
 	tests := []struct {
 		name       string
 		fsType     string
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
 		{name: "unknown", source: "overlay", want: "unknown"},
 	}
 	for _, tc := range tests {
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
 			got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
 			if got != tc.want {
@@ -26,3 +29,29 @@ func TestInferLiveBootKind(t *testing.T) {
 		})
 	}
 }
 func TestVerifyInstallToRAMStatus(t *testing.T) {
 	t.Parallel()
 	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
 		t.Fatalf("expected success for RAM-backed status, got %v", err)
 	}
 	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
 	if err == nil {
 		t.Fatal("expected verification failure when media is still on USB")
 	}
 	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
 		t.Fatalf("error=%q", got)
 	}
 }
 func TestDescribeLiveBootSource(t *testing.T) {
 	t.Parallel()
 	if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
 		t.Fatalf("got %q want RAM", got)
 	}
 	if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
 		t.Fatalf("got %q want /run/live/medium", got)
 	}
 }
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -15,6 +15,10 @@ var workerPatterns = []string{
 	"stress-ng",
 	"stressapptest",
 	"memtester",
 	// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
 	// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
 	"nvvs",
 	"dcgmi",
 }
 // KilledProcess describes a process that was sent SIGKILL.
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
 		return "", err
 	}
-	return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
+	return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
+		satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
 		job,
-		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
-	}, logFunc)
+	), logFunc)
 }
 func nvidiaStressArchivePrefix(loader string) string {
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
 			wg.Add(1)
 			go func() {
 				defer wg.Done()
-				gpuCmd := buildGPUStressCmd(loadCtx, vendor)
+				gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
 				if gpuCmd == nil {
 					return
 				}
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
 		cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
 	}
 	cmd := exec.CommandContext(ctx, path, cmdArgs...)
 	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	cmd.Cancel = func() error {
 		if cmd.Process != nil {
 			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
 		}
 		return nil
 	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	if err := startLowPriorityCmd(cmd, 15); err != nil {
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
 // buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
 // Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
-func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
+func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
 	switch strings.ToLower(vendor) {
 	case "amd":
-		return buildAMDGPUStressCmd(ctx)
+		return buildAMDGPUStressCmd(ctx, durSec)
 	case "nvidia":
-		return buildNvidiaGPUStressCmd(ctx)
+		return buildNvidiaGPUStressCmd(ctx, durSec)
 	}
 	return nil
 }
-func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	rvsArgs, err := resolveRVSCommand()
 	if err != nil {
 		return nil
 	}
 	rvsPath := rvsArgs[0]
-	cfg := `actions:
+	cfg := fmt.Sprintf(`actions:
 - name: gst_platform
  device: all
  module: gst
  parallel: true
-  duration: 86400000
+  duration: %d`, durSec*1000) + `
  copy_matrix: false
  target_stress: 90
  matrix_size_a: 8640
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
 	cfgFile := "/tmp/bee-platform-gst.conf"
 	_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
 	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
 	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	cmd.Cancel = func() error {
 		if cmd.Process != nil {
 			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
 		}
 		return nil
 	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	_ = startLowPriorityCmd(cmd, 10)
 	return cmd
 }
-func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	path, err := satLookPath("bee-gpu-burn")
 	if err != nil {
 		path, err = satLookPath("bee-gpu-stress")
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
 	if err != nil {
 		return nil
 	}
-	cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
+	// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
 	// Process group kill via Setpgid+Cancel is kept as a safety net for cases
 	// where the context is cancelled early (user stop, parent timeout).
 	cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
 	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	cmd.Cancel = func() error {
 		if cmd.Process != nil {
 			_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
 		}
 		return nil
 	}
 	cmd.Stdout = nil
 	cmd.Stderr = nil
 	_ = startLowPriorityCmd(cmd, 10)
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 	case "nvidia":
 		tools = append(tools, s.CheckTools([]string{
 			"nvidia-smi",
 			"dcgmi",
 			"nv-hostengine",
 			"nvidia-bug-report.sh",
 			"bee-gpu-burn",
 			"bee-john-gpu-stress",
 			"bee-nccl-gpu-stress",
 			"all_reduce_perf",
 		})...)
 		tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
 	case "amd":
 		tool := ToolStatus{Name: "rocm-smi"}
 		if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -155,11 +158,37 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 	return tools
 }
 // resolvedToolStatus reports a tool under the name display, looking up each
 // candidate binary name in PATH in order. The first candidate found sets Path
 // and OK; if none is found only Name is populated.
 func resolvedToolStatus(display string, candidates ...string) ToolStatus {
 	status := ToolStatus{Name: display}
 	for i := range candidates {
 		found, err := exec.LookPath(candidates[i])
 		if err != nil {
 			continue
 		}
 		status.Path = found
 		status.OK = true
 		break
 	}
 	return status
 }
 func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
 	lsmodText := commandText("lsmod")
 	switch vendor {
 	case "nvidia":
 		if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
 			health.NvidiaGSPMode = strings.TrimSpace(string(raw))
 			if health.NvidiaGSPMode == "gsp-stuck" {
 				health.Issues = append(health.Issues, schema.RuntimeIssue{
 					Code:        "nvidia_gsp_stuck",
 					Severity:    "critical",
 					Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
 				})
 			} else if health.NvidiaGSPMode == "gsp-off" {
 				health.Issues = append(health.Issues, schema.RuntimeIssue{
 					Code:        "nvidia_gsp_disabled",
 					Severity:    "warning",
 					Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
 				})
 			}
 		}
 		health.DriverReady = strings.Contains(lsmodText, "nvidia ")
 		if !health.DriverReady {
 			health.Issues = append(health.Issues, schema.RuntimeIssue{
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -12,19 +12,20 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
 	"syscall"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
 	"syscall"
 	"time"
 )
 var (
-	satExecCommand = exec.Command
+	satExecCommand  = exec.Command
-	satLookPath    = exec.LookPath
+	satLookPath     = exec.LookPath
-	satGlob        = filepath.Glob
+	satGlob         = filepath.Glob
-	satStat        = os.Stat
+	satStat         = os.Stat
 	satFreeMemBytes = freeMemBytes
 	rocmSMIExecutableGlobs = []string{
 		"/opt/rocm/bin/rocm-smi",
@@ -38,6 +39,12 @@ var (
 		"/opt/rocm/bin/rvs",
 		"/opt/rocm-*/bin/rvs",
 	}
 	dcgmProfTesterCandidates = []string{
 		"dcgmproftester",
 		"dcgmproftester13",
 		"dcgmproftester12",
 		"dcgmproftester11",
 	}
 )
 // streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
@@ -76,15 +83,15 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
 // NvidiaGPU holds basic GPU info from nvidia-smi.
 type NvidiaGPU struct {
-	Index    int
+	Index    int    `json:"index"`
-	Name     string
+	Name     string `json:"name"`
-	MemoryMB int
+	MemoryMB int    `json:"memory_mb"`
 }
 // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
 type AMDGPUInfo struct {
-	Index int
+	Index int    `json:"index"`
-	Name  string
+	Name  string `json:"name"`
 }
 // DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
@@ -256,6 +263,9 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
 			MemoryMB: memMB,
 		})
 	}
 	sort.Slice(gpus, func(i, j int) bool {
 		return gpus[i].Index < gpus[j].Index
 	})
 	return gpus, nil
 }
@@ -268,13 +278,87 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
-	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
+	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-all-reduce-perf.log", cmd: []string{
+		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
 		}},
-	}, logFunc)
+	), logFunc)
 }
 // RunNvidiaOfficialComputePack runs the "gpu-nvidia-compute" acceptance pack:
 // nvidia-smi and dcgmi version snapshots, a dcgmproftester load restricted to
 // the selected GPUs via CUDA_VISIBLE_DEVICES, and a closing nvidia-smi query.
 // An empty gpuIndices selects every detected GPU; a non-positive durationSec
 // falls back to the default burn duration. It returns whatever
 // runAcceptancePackCtx returns, or an error if GPU resolution or the
 // dcgmproftester lookup fails.
 func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	targets, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
 	burnSec := normalizeNvidiaBurnDuration(durationSec)
 	loadCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(burnSec))
 	if err != nil {
 		return "", err
 	}
 	loadJob := satJob{
 		name:       "03-dcgmproftester.log",
 		cmd:        loadCmd,
 		env:        nvidiaVisibleDevicesEnv(targets),
 		collectGPU: true,
 		gpuIndices: targets,
 	}
 	afterCmd := []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}
 	jobs := withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
 		loadJob,
 		satJob{name: "04-nvidia-smi-after.log", cmd: afterCmd},
 	)
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", jobs, logFunc)
 }
 // RunNvidiaTargetedPowerPack runs the "gpu-nvidia-targeted-power" acceptance
 // pack: an nvidia-smi snapshot, the dcgmi "targeted_power" diagnostic on the
 // selected GPUs, and a closing nvidia-smi query. An empty gpuIndices selects
 // every detected GPU; a non-positive durationSec falls back to the default
 // burn duration.
 func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	targets, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
 	diagJob := satJob{
 		name:       "02-dcgmi-targeted-power.log",
 		cmd:        nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), targets),
 		collectGPU: true,
 		gpuIndices: targets,
 	}
 	afterCmd := []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}
 	jobs := withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		diagJob,
 		satJob{name: "03-nvidia-smi-after.log", cmd: afterCmd},
 	)
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", jobs, logFunc)
 }
 // RunNvidiaPulseTestPack runs the "gpu-nvidia-pulse" acceptance pack: an
 // nvidia-smi snapshot, the dcgmi "pulse_test" diagnostic on the selected GPUs,
 // and a closing nvidia-smi query. An empty gpuIndices selects every detected
 // GPU; a non-positive durationSec falls back to the default burn duration.
 func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	targets, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
 	diagJob := satJob{
 		name:       "02-dcgmi-pulse-test.log",
 		cmd:        nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), targets),
 		collectGPU: true,
 		gpuIndices: targets,
 	}
 	afterCmd := []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}
 	jobs := withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		diagJob,
 		satJob{name: "03-nvidia-smi-after.log", cmd: afterCmd},
 	)
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", jobs, logFunc)
 }
 // RunNvidiaBandwidthPack runs the "gpu-nvidia-bandwidth" acceptance pack: an
 // nvidia-smi snapshot, the dcgmi "nvbandwidth" diagnostic on the selected GPUs
 // (no duration parameter is passed), and a closing nvidia-smi query. An empty
 // gpuIndices selects every detected GPU.
 func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
 	targets, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
 	diagJob := satJob{
 		name:       "02-dcgmi-nvbandwidth.log",
 		cmd:        nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, targets),
 		collectGPU: true,
 		gpuIndices: targets,
 	}
 	afterCmd := []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}
 	jobs := withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		diagJob,
 		satJob{name: "03-nvidia-smi-after.log", cmd: afterCmd},
 	)
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", jobs, logFunc)
 }
 func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
@@ -286,7 +370,68 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
 // gpuIndices: specific GPU indices to test (empty = all GPUs).
 // ctx cancellation kills the running job.
 func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
+	resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
 }
 // RunNvidiaTargetedStressValidatePack runs the "gpu-nvidia-targeted-stress"
 // acceptance pack: an nvidia-smi snapshot, the dcgmi "targeted_stress"
 // diagnostic on the selected GPUs, and a closing nvidia-smi query. Before the
 // pack starts it kills leftover test workers and, when logFunc is non-nil,
 // logs each killed process. An empty gpuIndices selects every detected GPU; a
 // non-positive durationSec falls back to the default burn duration.
 func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
 	targets, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
 	// Lingering nvvs/dcgmi processes from an interrupted earlier run make
 	// dcgmi diag fail with DCGM_ST_IN_USE (-34), so clear them out first.
 	stale := KillTestWorkers()
 	if logFunc != nil {
 		for _, proc := range stale {
 			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", proc.PID, proc.Name))
 		}
 	}
 	diagJob := satJob{
 		name:       "02-dcgmi-targeted-stress.log",
 		cmd:        nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), targets),
 		collectGPU: true,
 		gpuIndices: targets,
 	}
 	afterCmd := []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}
 	jobs := withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		diagJob,
 		satJob{name: "03-nvidia-smi-after.log", cmd: afterCmd},
 	)
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", jobs, logFunc)
 }
 func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
 	if len(gpuIndices) > 0 {
 		return dedupeSortedIndices(gpuIndices), nil
 	}
 	all, err := listNvidiaGPUIndices()
 	if err != nil {
 		return nil, err
 	}
 	if len(all) == 0 {
 		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
 	}
 	return all, nil
 }
 // memoryStressSizeArg returns the size argument for the memory stress job.
 // A positive BEE_VM_STRESS_SIZE_MB override wins and is rendered as "<n>M".
 // Otherwise the size is two thirds of the value reported by satFreeMemBytes
 // (in MiB), rounded down to a multiple of 256 once it reaches 256. When the
 // reported value is not positive, or the computed size is zero, "80%" is
 // returned instead.
 func memoryStressSizeArg() string {
 	const fallback = "80%"
 	if overrideMB := envInt("BEE_VM_STRESS_SIZE_MB", 0); overrideMB > 0 {
 		return fmt.Sprintf("%dM", overrideMB)
 	}
 	freeBytes := satFreeMemBytes()
 	if freeBytes <= 0 {
 		return fallback
 	}
 	sizeMB := (freeBytes / (1024 * 1024)) * 2 / 3
 	if sizeMB >= 256 {
 		// Drop the remainder so the size is a whole multiple of 256 MiB.
 		sizeMB -= sizeMB % 256
 	}
 	if sizeMB <= 0 {
 		return fallback
 	}
 	return fmt.Sprintf("%dM", sizeMB)
 }
 func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
@@ -304,11 +449,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
 	if seconds <= 0 {
 		seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
 	}
-	// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
+	// Base the default on current MemAvailable and keep headroom for the OS and
-	sizeArg := "80%"
+	// concurrent stressors so mixed burn runs do not trip the OOM killer.
-	if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
+	sizeArg := memoryStressSizeArg()
 		sizeArg = fmt.Sprintf("%dM", mb)
 	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-stress-ng-vm.log", cmd: []string{
@@ -425,14 +568,24 @@ type satStats struct {
 	Unsupported int
 }
 // withNvidiaPersistenceMode returns a new job list that starts with an
 // "nvidia-smi -pm 1" pre-flight job, followed by jobs in their given order.
 func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
 	preflight := satJob{
 		name: "00-nvidia-smi-persistence-mode.log",
 		cmd:  []string{"nvidia-smi", "-pm", "1"},
 	}
 	out := make([]satJob, 1, len(jobs)+1)
 	out[0] = preflight
 	return append(out, jobs...)
 }
 func nvidiaSATJobs() []satJob {
-	return []satJob{
+	return withNvidiaPersistenceMode(
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
+		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
-		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
+		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
-		{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
+		satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
-		{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
+		satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
-	}
+	)
 }
 func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
@@ -447,11 +600,39 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
 		}
 		diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
 	}
-	return []satJob{
+	return withNvidiaPersistenceMode(
-		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
-		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
+		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
-		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
+		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
-		{name: "04-dcgmi-diag.log", cmd: diagArgs},
+		satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
 	)
 }
 // nvidiaDCGMNamedDiagCommand builds the argv for "dcgmi diag -r <name>".
 // A positive durationSec adds "-p <name>.test_duration=<sec>"; a non-empty
 // gpuIndices adds "-i" with the indices rendered by joinIndexList.
 func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
 	cmd := []string{"dcgmi", "diag", "-r", name}
 	if durationSec > 0 {
 		durationParam := fmt.Sprintf("%s.test_duration=%d", name, durationSec)
 		cmd = append(cmd, "-p", durationParam)
 	}
 	if len(gpuIndices) == 0 {
 		return cmd
 	}
 	return append(cmd, "-i", joinIndexList(gpuIndices))
 }
 // normalizeNvidiaBurnDuration returns durationSec when it is positive and
 // 300 seconds otherwise.
 func normalizeNvidiaBurnDuration(durationSec int) int {
 	const defaultBurnSec = 300
 	if durationSec > 0 {
 		return durationSec
 	}
 	return defaultBurnSec
 }
 // nvidiaVisibleDevicesEnv returns the environment entries that restrict a CUDA
 // process to gpuIndices: CUDA_DEVICE_ORDER=PCI_BUS_ID plus a
 // CUDA_VISIBLE_DEVICES list. It returns nil when no indices are given.
 func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
 	if len(gpuIndices) > 0 {
 		visible := "CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)
 		return []string{"CUDA_DEVICE_ORDER=PCI_BUS_ID", visible}
 	}
 	return nil
 }
@@ -493,6 +674,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
 		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
 			return "", writeErr
 		}
 		if ctx.Err() != nil {
 			return "", ctx.Err()
 		}
 		status, rc := classifySATResult(job.name, out, err)
 		stats.Add(status)
 		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
@@ -624,6 +808,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
 	}
 	if strings.Contains(text, "unsupported") ||
 		strings.Contains(text, "not supported") ||
 		strings.Contains(text, "not found in path") ||
 		strings.Contains(text, "invalid opcode") ||
 		strings.Contains(text, "unknown command") ||
 		strings.Contains(text, "not implemented") ||
@@ -730,6 +915,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
 	return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
 }
 // resolveDCGMProfTesterCommand returns an argv whose first element is the path
 // of the first dcgmProfTesterCandidates entry that satLookPath can find,
 // followed by args. It returns an error when no candidate is found.
 func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
 	for i := range dcgmProfTesterCandidates {
 		resolved, err := satLookPath(dcgmProfTesterCandidates[i])
 		if err != nil {
 			continue
 		}
 		cmd := make([]string, 0, 1+len(args))
 		cmd = append(cmd, resolved)
 		return append(cmd, args...), nil
 	}
 	return nil, errors.New("dcgmproftester not found in PATH")
 }
 func ensureAMDRuntimeReady() error {
 	if _, err := os.Stat("/dev/kfd"); err == nil {
 		return nil
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -1,12 +1,14 @@
 package platform
 import (
 	"context"
 	"errors"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 )
 func TestStorageSATCommands(t *testing.T) {
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
 	jobs := nvidiaSATJobs()
-	if len(jobs) != 5 {
+	if len(jobs) != 6 {
-		t.Fatalf("jobs=%d want 5", len(jobs))
+		t.Fatalf("jobs=%d want 6", len(jobs))
 	}
-	if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
+	if got := jobs[0].cmd[0]; got != "nvidia-smi" {
 		t.Fatalf("preflight command=%q want nvidia-smi", got)
 	}
 	if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
 		t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
 	}
 	if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
 		t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
 	}
-	if got := jobs[3].cmd[1]; got != "--output-file" {
+	if got := jobs[4].cmd[1]; got != "--output-file" {
 		t.Fatalf("bug report flag=%q want --output-file", got)
 	}
 }
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
 func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	jobs := nvidiaSATJobs()
-	got := jobs[4].cmd
+	got := jobs[5].cmd
 	want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
 	if len(got) != len(want) {
 		t.Fatalf("cmd len=%d want %d", len(got), len(want))
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	}
 }
 // TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag checks that the DCGM job
 // list has five entries, starts with the persistence-mode preflight, and ends
 // with the dcgmi diag call for the requested level and GPU selection.
 func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
 	const (
 		wantPreflight = "nvidia-smi -pm 1"
 		wantDiag      = "dcgmi diag -r 3 -i 2,0"
 	)
 	jobs := nvidiaDCGMJobs(3, []int{2, 0})
 	if n := len(jobs); n != 5 {
 		t.Fatalf("jobs=%d want 5", n)
 	}
 	preflight := strings.Join(jobs[0].cmd, " ")
 	if preflight != wantPreflight {
 		t.Fatalf("preflight=%q want %q", preflight, wantPreflight)
 	}
 	diag := strings.Join(jobs[4].cmd, " ")
 	if diag != wantDiag {
 		t.Fatalf("diag=%q want %q", diag, wantDiag)
 	}
 }
 func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
 	t.Parallel()
@@ -162,6 +183,89 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
 	}
 }
 // TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset checks that a nil
 // selection falls back to the GPU indices printed by the stubbed nvidia-smi
 // and that the result comes back in ascending order.
 //
 // The test replaces the package-level satExecCommand hook, so it must not call
 // t.Parallel: a parallel test would race with every other test that reads or
 // replaces the hook while this stub is installed.
 func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
 	oldExecCommand := satExecCommand
 	satExecCommand = func(name string, args ...string) *exec.Cmd {
 		if name == "nvidia-smi" {
 			// Deliberately unsorted so the ordering of the result is exercised.
 			return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
 		}
 		return exec.Command(name, args...)
 	}
 	t.Cleanup(func() { satExecCommand = oldExecCommand })
 	got, err := resolveDCGMGPUIndices(nil)
 	if err != nil {
 		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
 	}
 	if want := "0,1,2"; joinIndexList(got) != want {
 		t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
 	}
 }
 // TestResolveDCGMGPUIndicesKeepsExplicitSelection checks that an explicit
 // selection with a repeated index comes back de-duplicated and ascending.
 func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
 	t.Parallel()
 	const want = "1,3"
 	indices, err := resolveDCGMGPUIndices([]int{3, 1, 3})
 	if err != nil {
 		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
 	}
 	if joinIndexList(indices) == want {
 		return
 	}
 	t.Fatalf("gpuIndices=%q want %q", joinIndexList(indices), want)
 }
 // TestResolveDCGMProfTesterCommandUsesVersionedBinary stubs satLookPath so
 // that only dcgmproftester13 resolves, then checks that the resolved path
 // leads the returned argv and that all three arguments follow it.
 func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
 	previous := satLookPath
 	satLookPath = func(file string) (string, error) {
 		if file == "dcgmproftester13" {
 			return "/usr/bin/dcgmproftester13", nil
 		}
 		return "", exec.ErrNotFound
 	}
 	t.Cleanup(func() { satLookPath = previous })

 	argv, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
 	if err != nil {
 		t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
 	}
 	if n := len(argv); n != 4 {
 		t.Fatalf("cmd len=%d want 4 (%v)", n, argv)
 	}
 	if binary := argv[0]; binary != "/usr/bin/dcgmproftester13" {
 		t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", binary)
 	}
 }
 // TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection checks the exact argv
 // built for a named DCGM diagnostic with a duration and a GPU selection.
 func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 	want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
 	got := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
 	if len(got) != len(want) {
 		t.Fatalf("cmd len=%d want %d (%v)", len(got), len(want), got)
 	}
 	for i, expected := range want {
 		if actual := got[i]; actual != expected {
 			t.Fatalf("cmd[%d]=%q want %q", i, actual, expected)
 		}
 	}
 }
 // TestNvidiaVisibleDevicesEnvUsesSelectedGPUs checks that the environment for
 // a GPU selection holds exactly the device-order and visible-devices entries.
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
 	if n := len(env); n != 2 {
 		t.Fatalf("env len=%d want 2 (%v)", n, env)
 	}
 	order, visible := env[0], env[1]
 	if order != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
 		t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", order)
 	}
 	if visible != "CUDA_VISIBLE_DEVICES=0,2,4" {
 		t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", visible)
 	}
 }
 func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
 	t.Parallel()
@@ -196,6 +300,37 @@ func TestEnvIntFallback(t *testing.T) {
 	}
 }
 // TestMemoryStressSizeArgUsesAvailableMemory stubs the free-memory probe at
 // 96 GiB and expects a 65536M size argument.
 func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
 	const gib = int64(1024 * 1024 * 1024)
 	previous := satFreeMemBytes
 	t.Cleanup(func() { satFreeMemBytes = previous })
 	satFreeMemBytes = func() int64 { return 96 * gib }

 	sizeArg := memoryStressSizeArg()
 	if sizeArg != "65536M" {
 		t.Fatalf("sizeArg=%q want 65536M", sizeArg)
 	}
 }
 // TestMemoryStressSizeArgRespectsOverride sets BEE_VM_STRESS_SIZE_MB and
 // expects that value to win over the stubbed 96 GiB free-memory reading.
 func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
 	const gib = int64(1024 * 1024 * 1024)
 	previous := satFreeMemBytes
 	t.Cleanup(func() { satFreeMemBytes = previous })
 	satFreeMemBytes = func() int64 { return 96 * gib }
 	t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")

 	sizeArg := memoryStressSizeArg()
 	if sizeArg != "4096M" {
 		t.Fatalf("sizeArg=%q want 4096M", sizeArg)
 	}
 }
 // TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown stubs the free-memory
 // probe to report zero and expects the percentage fallback "80%".
 func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
 	previous := satFreeMemBytes
 	t.Cleanup(func() { satFreeMemBytes = previous })
 	satFreeMemBytes = func() int64 { return 0 }

 	sizeArg := memoryStressSizeArg()
 	if sizeArg != "80%" {
 		t.Fatalf("sizeArg=%q want 80%%", sizeArg)
 	}
 }
 func TestClassifySATResult(t *testing.T) {
 	tests := []struct {
 		name   string
@@ -220,6 +355,38 @@ func TestClassifySATResult(t *testing.T) {
 	}
 }
 // TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive cancels the
 // context while the pack's only job is still sleeping and checks that
 // runAcceptancePackCtx reports context.Canceled, returns an empty archive
 // path, and leaves no cancelled-pack-*.tar.gz file in the output directory.
 func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
 	dir := t.TempDir()
 	ctx, cancel := context.WithCancel(context.Background())
 	t.Cleanup(cancel)
 	done := make(chan struct{})
 	go func() {
 		// Cancel shortly after the pack starts; the job itself sleeps for 5s.
 		time.Sleep(100 * time.Millisecond)
 		cancel()
 		close(done)
 	}()
 	archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
 		{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
 	}, nil)
 	// Wait for the cancelling goroutine so it never outlives the test.
 	<-done
 	if !errors.Is(err, context.Canceled) {
 		t.Fatalf("err=%v want context.Canceled", err)
 	}
 	if archive != "" {
 		t.Fatalf("archive=%q want empty", archive)
 	}
 	// The archive path being empty is not enough: also confirm nothing was written.
 	matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
 	if globErr != nil {
 		t.Fatalf("Glob error: %v", globErr)
 	}
 	if len(matches) != 0 {
 		t.Fatalf("archives=%v want none", matches)
 	}
 }
 func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -10,17 +10,30 @@ import (
 func (s *System) ListBeeServices() ([]string, error) {
 	seen := map[string]bool{}
 	var out []string
-	for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
+	for _, pattern := range []string{
 		"/etc/systemd/system/bee-*.service",
 		"/lib/systemd/system/bee-*.service",
 		"/etc/systemd/system/bee-*.timer",
 		"/lib/systemd/system/bee-*.timer",
 	} {
 		matches, err := filepath.Glob(pattern)
 		if err != nil {
 			return nil, err
 		}
 		for _, match := range matches {
-			name := strings.TrimSuffix(filepath.Base(match), ".service")
+			base := filepath.Base(match)
 			name := base
 			if strings.HasSuffix(base, ".service") {
 				name = strings.TrimSuffix(base, ".service")
 			}
 			// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
 			if strings.HasSuffix(name, "@") {
 				continue
 			}
 			// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
 			if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
 				continue
 			}
 			if !seen[name] {
 				seen[name] = true
 				out = append(out, name)
@@ -48,7 +61,9 @@ func (s *System) ServiceState(name string) string {
 }
 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
-	raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
+	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
 	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
 	return string(raw), err
 }
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -44,12 +44,12 @@ type StaticIPv4Config struct {
 }
 type RemovableTarget struct {
-	Device     string
+	Device     string `json:"device"`
-	FSType     string
+	FSType     string `json:"fs_type"`
-	Size       string
+	Size       string `json:"size"`
-	Label      string
+	Label      string `json:"label"`
-	Model      string
+	Model      string `json:"model"`
-	Mountpoint string
+	Mountpoint string `json:"mountpoint"`
 }
 type ToolStatus struct {
--- a/audit/internal/platform/types_test.go
+++ b/audit/internal/platform/types_test.go
@@ -0,0 +1,31 @@
 package platform
 import (
 	"encoding/json"
 	"strings"
 	"testing"
 )
 // TestRemovableTargetJSONUsesFrontendFieldNames marshals a fully populated
 // RemovableTarget and checks that every field is emitted under its snake_case
 // JSON key and that none of the Go field names leak into the output.
 func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
 	t.Parallel()
 	data, err := json.Marshal(RemovableTarget{
 		Device:     "/dev/sdb1",
 		FSType:     "exfat",
 		Size:       "1.8T",
 		Label:      "USB",
 		Model:      "Flash",
 		Mountpoint: "/mnt/usb",
 	})
 	if err != nil {
 		t.Fatalf("marshal: %v", err)
 	}
 	raw := string(data)
 	// Every tagged field must be present, including mountpoint.
 	for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`, `"mountpoint"`} {
 		if !strings.Contains(raw, key) {
 			t.Fatalf("json missing key %s: %s", key, raw)
 		}
 	}
 	// None of the untagged Go names may survive; check all six, not just two.
 	for _, goName := range []string{`"Device"`, `"FSType"`, `"Size"`, `"Label"`, `"Model"`, `"Mountpoint"`} {
 		if strings.Contains(raw, goName) {
 			t.Fatalf("json still contains Go field names: %s", raw)
 		}
 	}
 }
--- a/audit/internal/schema/hardware.go
+++ b/audit/internal/schema/hardware.go
@@ -20,6 +20,7 @@ type RuntimeHealth struct {
 	ExportDir     string                 `json:"export_dir,omitempty"`
 	DriverReady   bool                   `json:"driver_ready,omitempty"`
 	CUDAReady     bool                   `json:"cuda_ready,omitempty"`
 	NvidiaGSPMode string                 `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
 	NetworkStatus string                 `json:"network_status,omitempty"`
 	Issues        []RuntimeIssue         `json:"issues,omitempty"`
 	Tools         []RuntimeToolStatus    `json:"tools,omitempty"`
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -11,6 +11,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"regexp"
 	"sort"
 	"strings"
 	"sync/atomic"
 	"syscall"
@@ -21,13 +22,238 @@ import (
 )
 var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
 // apiListNvidiaGPUs lists the NVIDIA GPUs known to the app. It is a
 // package-level variable rather than a plain function so tests can replace it
 // with a stub. The default implementation returns an error for a nil app and
 // otherwise delegates to a.ListNvidiaGPUs.
 var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
 	if a == nil {
 		return nil, fmt.Errorf("app not configured")
 	}
 	return a.ListNvidiaGPUs()
 }
 // ── Job ID counter ────────────────────────────────────────────────────────────
 var jobCounter atomic.Uint64
-func newJobID(prefix string) string {
+func newJobID(_ string) string {
-	return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
+	start := int((jobCounter.Add(1) - 1) % 1000)
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
 	for offset := 0; offset < 1000; offset++ {
 		n := (start + offset) % 1000
 		id := fmt.Sprintf("TASK-%03d", n)
 		if !taskIDInUseLocked(id) {
 			return id
 		}
 	}
 	return fmt.Sprintf("TASK-%03d", start)
 }
 // taskIDInUseLocked reports whether any non-nil task in globalQueue.tasks
 // already carries id. It reads the queue without locking; newJobID calls it
 // while holding globalQueue.mu.
 func taskIDInUseLocked(id string) bool {
 	queued := globalQueue.tasks
 	for i := range queued {
 		task := queued[i]
 		if task == nil {
 			continue
 		}
 		if task.ID == id {
 			return true
 		}
 	}
 	return false
 }
 // taskRunResponse is the JSON body returned after one or more tasks were
 // queued. writeTaskRunResponse fills TaskID and JobID with the same first ID,
 // and TaskIDs and JobIDs with the same full list. All fields are omitted from
 // the JSON when empty.
 type taskRunResponse struct {
 	TaskID    string   `json:"task_id,omitempty"`    // ID of the first queued task.
 	JobID     string   `json:"job_id,omitempty"`     // Same value as TaskID.
 	TaskIDs   []string `json:"task_ids,omitempty"`   // IDs of all queued tasks, in order.
 	JobIDs    []string `json:"job_ids,omitempty"`    // Same values as TaskIDs.
 	TaskCount int      `json:"task_count,omitempty"` // Number of IDs in TaskIDs.
 }
 // nvidiaTaskSelection describes the GPUs one queued task should run on,
 // together with a human-readable label used when naming that task.
 type nvidiaTaskSelection struct {
 	GPUIndices []int  // GPU indices assigned to the task.
 	Label      string // Display label, e.g. the model plus the GPU index list.
 }
 // writeTaskRunResponse writes a taskRunResponse describing the queued tasks.
 // Nil tasks and tasks with a blank ID are left out. The single-ID fields carry
 // the first remaining ID; an empty response is written when nothing remains.
 func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
 	var resp taskRunResponse
 	if len(tasks) > 0 {
 		ids := make([]string, 0, len(tasks))
 		for _, task := range tasks {
 			if task != nil && strings.TrimSpace(task.ID) != "" {
 				ids = append(ids, task.ID)
 			}
 		}
 		resp.TaskCount = len(ids)
 		if len(ids) > 0 {
 			first := ids[0]
 			resp.TaskID, resp.JobID = first, first
 			resp.TaskIDs, resp.JobIDs = ids, ids
 		}
 	}
 	writeJSON(w, resp)
 }
 // shouldSplitHomogeneousNvidiaTarget reports whether target, after trimming
 // surrounding whitespace, is one of the NVIDIA targets that are queued as one
 // task per GPU model group.
 func shouldSplitHomogeneousNvidiaTarget(target string) bool {
 	splittable := [...]string{
 		"nvidia",
 		"nvidia-targeted-stress",
 		"nvidia-benchmark",
 		"nvidia-compute",
 		"nvidia-targeted-power",
 		"nvidia-pulse",
 		"nvidia-interconnect",
 		"nvidia-bandwidth",
 		"nvidia-stress",
 	}
 	name := strings.TrimSpace(target)
 	for _, candidate := range splittable {
 		if name == candidate {
 			return true
 		}
 	}
 	return false
 }
 // expandHomogeneousNvidiaSelections turns a GPU inventory plus optional
 // include/exclude index lists into per-task selections in which every
 // selection holds GPUs of a single model.
 //
 // With an empty include list all detected GPUs are candidates; otherwise only
 // the included indices that exist in gpus are kept, without duplicates.
 // Excluded indices are then removed. Models with two or more selected GPUs
 // produce one grouped selection each and are returned first; models with a
 // single selected GPU follow as individual selections. It returns an error
 // when gpus is empty or when nothing remains selected.
 func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
 	if len(gpus) == 0 {
 		return nil, fmt.Errorf("no NVIDIA GPUs detected")
 	}

 	byIndex := make(map[int]platform.NvidiaGPU, len(gpus))
 	detected := make([]int, 0, len(gpus))
 	for _, gpu := range gpus {
 		byIndex[gpu.Index] = gpu
 		detected = append(detected, gpu.Index)
 	}
 	sort.Ints(detected)

 	// Start from everything detected unless the caller named specific GPUs.
 	chosen := detected
 	if len(include) > 0 {
 		chosen = make([]int, 0, len(include))
 		added := make(map[int]struct{}, len(include))
 		for _, idx := range include {
 			_, known := byIndex[idx]
 			_, repeated := added[idx]
 			if !known || repeated {
 				continue
 			}
 			added[idx] = struct{}{}
 			chosen = append(chosen, idx)
 		}
 		sort.Ints(chosen)
 	}

 	if len(exclude) > 0 {
 		dropped := make(map[int]struct{}, len(exclude))
 		for _, idx := range exclude {
 			dropped[idx] = struct{}{}
 		}
 		// Filter in place, reusing the backing array of chosen.
 		kept := chosen[:0]
 		for _, idx := range chosen {
 			if _, skip := dropped[idx]; !skip {
 				kept = append(kept, idx)
 			}
 		}
 		chosen = kept
 	}
 	if len(chosen) == 0 {
 		return nil, fmt.Errorf("no NVIDIA GPUs selected")
 	}

 	byModel := make(map[string][]platform.NvidiaGPU)
 	models := make([]string, 0)
 	for _, idx := range chosen {
 		gpu := byIndex[idx]
 		model := strings.TrimSpace(gpu.Name)
 		if model == "" {
 			// Unnamed GPUs get a per-index pseudo model.
 			model = fmt.Sprintf("GPU %d", gpu.Index)
 		}
 		members, seen := byModel[model]
 		if !seen {
 			models = append(models, model)
 		}
 		byModel[model] = append(members, gpu)
 	}
 	// Order models by the index of their first GPU; fall back to the name.
 	sort.Slice(models, func(i, j int) bool {
 		a, b := byModel[models[i]], byModel[models[j]]
 		if len(a) > 0 && len(b) > 0 {
 			return a[0].Index < b[0].Index
 		}
 		return models[i] < models[j]
 	})

 	var grouped, solo []nvidiaTaskSelection
 	for _, model := range models {
 		members := byModel[model]
 		sort.Slice(members, func(i, j int) bool { return members[i].Index < members[j].Index })
 		if len(members) < 2 {
 			only := members[0]
 			solo = append(solo, nvidiaTaskSelection{
 				GPUIndices: []int{only.Index},
 				Label:      fmt.Sprintf("GPU %d — %s", only.Index, model),
 			})
 			continue
 		}
 		indices := make([]int, 0, len(members))
 		for _, member := range members {
 			indices = append(indices, member.Index)
 		}
 		grouped = append(grouped, nvidiaTaskSelection{
 			GPUIndices: indices,
 			Label:      fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
 		})
 	}
 	return append(grouped, solo...), nil
 }
 // joinTaskIndices renders indices as a comma-separated decimal list with no
 // spaces. An empty or nil slice yields the empty string.
 func joinTaskIndices(indices []int) string {
 	var b strings.Builder
 	for i, idx := range indices {
 		if i > 0 {
 			b.WriteByte(',')
 		}
 		fmt.Fprintf(&b, "%d", idx)
 	}
 	return b.String()
 }
 // formatSplitTaskName combines a base task name with a selection label as
 // "base (label)". Both parts are trimmed first; when either one is blank the
 // other is returned on its own.
 func formatSplitTaskName(baseName, selectionLabel string) string {
 	base := strings.TrimSpace(baseName)
 	label := strings.TrimSpace(selectionLabel)
 	switch {
 	case base == "":
 		return label
 	case label == "":
 		return base
 	default:
 		return base + " (" + label + ")"
 	}
 }
 // buildNvidiaTaskSet creates the pending tasks for one run request.
 //
 // Targets that shouldSplitHomogeneousNvidiaTarget rejects yield a single task
 // carrying params unchanged. For all other targets the GPU inventory is
 // queried and one task is created per selection returned by
 // expandHomogeneousNvidiaSelections; each of those tasks gets its own copy of
 // the GPU index list, no exclude list, and a display name derived from
 // baseName and the selection label. Inventory and selection errors are
 // returned as-is.
 func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
 	// pending builds one task; only the name and the params vary per task.
 	pending := func(name string, p taskParams) *Task {
 		return &Task{
 			ID:        newJobID(idPrefix),
 			Name:      name,
 			Target:    target,
 			Priority:  priority,
 			Status:    TaskPending,
 			CreatedAt: createdAt,
 			params:    p,
 		}
 	}

 	if !shouldSplitHomogeneousNvidiaTarget(target) {
 		return []*Task{pending(baseName, params)}, nil
 	}

 	gpus, err := apiListNvidiaGPUs(appRef)
 	if err != nil {
 		return nil, err
 	}
 	selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
 	if err != nil {
 		return nil, err
 	}

 	tasks := make([]*Task, 0, len(selections))
 	for _, selection := range selections {
 		perTask := params
 		// Copy the indices so tasks never share a backing array.
 		perTask.GPUIndices = append([]int(nil), selection.GPUIndices...)
 		perTask.ExcludeGPUIndices = nil
 		perTask.DisplayName = formatSplitTaskName(baseName, selection.Label)
 		tasks = append(tasks, pending(perTask.DisplayName, perTask))
 	}
 	return tasks, nil
 }
 // ── SSE helpers ───────────────────────────────────────────────────────────────
@@ -110,6 +336,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
 	scanDone := make(chan error, 1)
 	go func() {
 		defer func() {
 			if rec := recover(); rec != nil {
 				scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
 			}
 		}()
 		scanner := bufio.NewScanner(pr)
 		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
 		for scanner.Scan() {
@@ -202,31 +433,84 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		}
 		name := taskDisplayName(target, body.Profile, body.Loader)
 		t := &Task{
 			ID:        newJobID("sat-" + target),
 			Name:      name,
 			Target:    target,
 			Status:    TaskPending,
 			CreatedAt: time.Now(),
 			params: taskParams{
 				Duration:           body.Duration,
 				DiagLevel:          body.DiagLevel,
 				GPUIndices:         body.GPUIndices,
 				ExcludeGPUIndices:  body.ExcludeGPUIndices,
 				Loader:             body.Loader,
 				BurnProfile:        body.Profile,
 				DisplayName:        body.DisplayName,
 				PlatformComponents: body.PlatformComponents,
 			},
 		}
 		if strings.TrimSpace(body.DisplayName) != "" {
-			t.Name = body.DisplayName
+			name = body.DisplayName
 		}
-		globalQueue.enqueue(t)
+		params := taskParams{
-		writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
+			Duration:           body.Duration,
 			DiagLevel:          body.DiagLevel,
 			GPUIndices:         body.GPUIndices,
 			ExcludeGPUIndices:  body.ExcludeGPUIndices,
 			Loader:             body.Loader,
 			BurnProfile:        body.Profile,
 			DisplayName:        body.DisplayName,
 			PlatformComponents: body.PlatformComponents,
 		}
 		tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
 		if err != nil {
 			writeError(w, http.StatusBadRequest, err.Error())
 			return
 		}
 		for _, t := range tasks {
 			globalQueue.enqueue(t)
 		}
 		writeTaskRunResponse(w, tasks)
 	}
 }
 // handleAPIBenchmarkNvidiaRun queues NVIDIA benchmark tasks from a JSON
 // request body and responds with the queued task IDs.
 //
 // It answers 503 when no app is configured and 400 when the body is not valid
 // JSON (an empty body is accepted) or when the task set cannot be built.
 // run_nccl defaults to true and parallel_gpus to false when absent from the
 // body. A non-blank display_name replaces the default task name.
 func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}

 	var body struct {
 		Profile           string `json:"profile"`
 		SizeMB            int    `json:"size_mb"`
 		GPUIndices        []int  `json:"gpu_indices"`
 		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
 		RunNCCL           *bool  `json:"run_nccl"`
 		ParallelGPUs      *bool  `json:"parallel_gpus"`
 		DisplayName       string `json:"display_name"`
 	}
 	if r.Body != nil {
 		decodeErr := json.NewDecoder(r.Body).Decode(&body)
 		if decodeErr != nil && !errors.Is(decodeErr, io.EOF) {
 			writeError(w, http.StatusBadRequest, "invalid request body")
 			return
 		}
 	}

 	// Pointer fields distinguish "absent" from an explicit false.
 	params := taskParams{
 		GPUIndices:        body.GPUIndices,
 		ExcludeGPUIndices: body.ExcludeGPUIndices,
 		SizeMB:            body.SizeMB,
 		BenchmarkProfile:  body.Profile,
 		RunNCCL:           true,
 		ParallelGPUs:      false,
 		DisplayName:       body.DisplayName,
 	}
 	if body.RunNCCL != nil {
 		params.RunNCCL = *body.RunNCCL
 	}
 	if body.ParallelGPUs != nil {
 		params.ParallelGPUs = *body.ParallelGPUs
 	}

 	name := taskDisplayName("nvidia-benchmark", "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
 		name = body.DisplayName
 	}

 	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), params, name, h.opts.App, "benchmark-nvidia")
 	if err != nil {
 		writeError(w, http.StatusBadRequest, err.Error())
 		return
 	}
 	for i := range tasks {
 		globalQueue.enqueue(tasks[i])
 	}
 	writeTaskRunResponse(w, tasks)
 }
 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
 	id := r.URL.Query().Get("job_id")
 	if id == "" {
@@ -330,11 +614,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
 		return
 	}
 	result, err := h.opts.App.ServiceActionResult(req.Name, action)
 	status := "ok"
 	if err != nil {
-		writeError(w, http.StatusInternalServerError, err.Error())
+		status = "error"
 		return
 	}
-	writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
+	// Always return 200 with output so the frontend can display the actual
 	// systemctl error message instead of a generic "exit status 1".
 	writeJSON(w, map[string]string{"status": status, "output": result.Body})
 }
 // ── Network ───────────────────────────────────────────────────────────────────
@@ -486,6 +772,22 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
 // ── GPU presence ──────────────────────────────────────────────────────────────
 // handleAPIGNVIDIAGPUs responds with the app's NVIDIA GPU list as JSON. It
 // answers 503 when no app is configured and 500 when listing fails. A nil
 // list is replaced by an empty slice so the body is a JSON array, not null.
 func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
 	application := h.opts.App
 	if application == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
 	gpus, err := application.ListNvidiaGPUs()
 	switch {
 	case err != nil:
 		writeError(w, http.StatusInternalServerError, err.Error())
 		return
 	case gpus == nil:
 		gpus = []platform.NvidiaGPU{}
 	}
 	writeJSON(w, gpus)
 }
 func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -511,14 +813,33 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
 	_, amdErr := os.Stat("/dev/kfd")
 	nvidiaUp := nvidiaErr == nil
 	amdUp := amdErr == nil
 	_, dcgmErr := exec.LookPath("dcgmi")
 	_, ncclStressErr := exec.LookPath("bee-nccl-gpu-stress")
 	_, johnErr := exec.LookPath("bee-john-gpu-stress")
 	_, beeBurnErr := exec.LookPath("bee-gpu-burn")
 	_, nvBandwidthErr := exec.LookPath("nvbandwidth")
 	profErr := lookPathAny("dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11")
 	writeJSON(w, []toolEntry{
-		{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
+		{ID: "nvidia-compute", Available: nvidiaUp && profErr == nil, Vendor: "nvidia"},
-		{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
+		{ID: "nvidia-targeted-power", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
-		{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
+		{ID: "nvidia-pulse", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
 		{ID: "nvidia-interconnect", Available: nvidiaUp && ncclStressErr == nil, Vendor: "nvidia"},
 		{ID: "nvidia-bandwidth", Available: nvidiaUp && dcgmErr == nil && nvBandwidthErr == nil, Vendor: "nvidia"},
 		{ID: "bee-gpu-burn", Available: nvidiaUp && beeBurnErr == nil, Vendor: "nvidia"},
 		{ID: "john", Available: nvidiaUp && johnErr == nil, Vendor: "nvidia"},
 		{ID: "rvs", Available: amdUp, Vendor: "amd"},
 	})
 }
 // lookPathAny returns nil as soon as exec.LookPath resolves one of names, in
 // the order given. It returns exec.ErrNotFound when none resolves, including
 // when names is empty.
 func lookPathAny(names ...string) error {
 	for i := 0; i < len(names); i++ {
 		_, err := exec.LookPath(names[i])
 		if err != nil {
 			continue
 		}
 		return nil
 	}
 	return exec.ErrNotFound
 }
 // ── System ────────────────────────────────────────────────────────────────────
 func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
@@ -557,7 +878,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
 var standardTools = []string{
 	"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
-	"nvidia-smi", "memtester", "stress-ng", "nvtop",
+	"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
 	"mstflint", "qrencode",
 }
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -1,6 +1,7 @@
 package webui
 import (
 	"encoding/json"
 	"net/http/httptest"
 	"strings"
 	"testing"
@@ -64,6 +65,141 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	}
 }
 // TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs posts a benchmark request
 // for two GPUs of the same model and checks that exactly one nvidia-benchmark
 // task is queued with both indices and with RunNCCL set to the explicit false
 // from the request.
 func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	// Start from an empty global queue and restore the original tasks afterwards.
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
 	globalQueue.tasks = nil
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = originalTasks
 		globalQueue.mu.Unlock()
 	})
 	// Stub the GPU inventory so no real hardware query is made.
 	prevList := apiListNvidiaGPUs
 	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
 		return []platform.NvidiaGPU{
 			{Index: 1, Name: "NVIDIA H100 PCIe"},
 			{Index: 3, Name: "NVIDIA H100 PCIe"},
 		}, nil
 	}
 	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
 	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
 	rec := httptest.NewRecorder()
 	h.handleAPIBenchmarkNvidiaRun(rec, req)
 	if rec.Code != 200 {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	// Hold the queue lock while inspecting the tasks the handler enqueued.
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
 	if len(globalQueue.tasks) != 1 {
 		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
 	}
 	task := globalQueue.tasks[0]
 	if task.Target != "nvidia-benchmark" {
 		t.Fatalf("target=%q want nvidia-benchmark", task.Target)
 	}
 	if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
 		t.Fatalf("gpu indices=%v want [1 3]", got)
 	}
 	if task.params.RunNCCL {
 		t.Fatal("RunNCCL should reflect explicit false from request")
 	}
 }
 // TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels posts a benchmark
 // request covering two H100 GPUs and one H200 GPU and checks that two tasks
 // are queued and reported: the two-GPU group first, then the single GPU.
 func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
 	// Start from an empty global queue and restore the original tasks afterwards.
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
 	globalQueue.tasks = nil
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = originalTasks
 		globalQueue.mu.Unlock()
 	})
 	// Stub the GPU inventory with a mixed-model set.
 	prevList := apiListNvidiaGPUs
 	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
 		return []platform.NvidiaGPU{
 			{Index: 0, Name: "NVIDIA H100 PCIe"},
 			{Index: 1, Name: "NVIDIA H100 PCIe"},
 			{Index: 2, Name: "NVIDIA H200 NVL"},
 		}, nil
 	}
 	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
 	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
 	rec := httptest.NewRecorder()
 	h.handleAPIBenchmarkNvidiaRun(rec, req)
 	if rec.Code != 200 {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	// The response must list one ID per queued task.
 	var resp taskRunResponse
 	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
 		t.Fatalf("decode response: %v", err)
 	}
 	if len(resp.TaskIDs) != 2 {
 		t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
 	}
 	// Hold the queue lock while inspecting the tasks the handler enqueued.
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
 	if len(globalQueue.tasks) != 2 {
 		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
 	}
 	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
 		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
 	}
 	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
 		t.Fatalf("task[1] gpu indices=%v want [2]", got)
 	}
 }
 // TestHandleAPISATRunSplitsMixedNvidiaTaskSet posts an nvidia-targeted-power
 // SAT request covering two H100 GPUs and one H200 GPU and checks that two
 // tasks are queued: the two-GPU group first, then the single GPU.
 func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
 	// Start from an empty global queue and restore the original tasks afterwards.
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
 	globalQueue.tasks = nil
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = originalTasks
 		globalQueue.mu.Unlock()
 	})
 	// Stub the GPU inventory with a mixed-model set.
 	prevList := apiListNvidiaGPUs
 	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
 		return []platform.NvidiaGPU{
 			{Index: 0, Name: "NVIDIA H100 PCIe"},
 			{Index: 1, Name: "NVIDIA H100 PCIe"},
 			{Index: 2, Name: "NVIDIA H200 NVL"},
 		}, nil
 	}
 	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
 	req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
 	rec := httptest.NewRecorder()
 	h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
 	if rec.Code != 200 {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	// Hold the queue lock while inspecting the tasks the handler enqueued.
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
 	if len(globalQueue.tasks) != 2 {
 		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
 	}
 	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
 		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
 	}
 	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
 		t.Fatalf("task[1] gpu indices=%v want [2]", got)
 	}
 }
 func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
 	h := &handler{}
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -0,0 +1,773 @@
 package webui
 import (
 	"fmt"
 	"math"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
 	"bee/audit/internal/platform"
 )
 // chartTimelineSegment is one contiguous span of a chart's time range,
 // flagged as either active or idle.
 type chartTimelineSegment struct {
 	// Start and End bound the span.
 	Start  time.Time
 	End    time.Time
 	// Active is true for spans covered by a task run interval; spans with
 	// Active == false are the ones shaded by writeTimelineIdleSpans.
 	Active bool
 }
 // chartScale describes a Y axis: the value range mapped onto the plot
 // height and the tick values drawn along it.
 type chartScale struct {
 	// Min and Max are the values mapped to the bottom and top of the plot.
 	Min   float64
 	Max   float64
 	// Ticks are the axis values at which tick marks and grid lines are drawn.
 	Ticks []float64
 }
 // chartLayout holds the SVG canvas size and the plot rectangle inside it,
 // all in SVG user units.
 type chartLayout struct {
 	// Width and Height are the dimensions of the whole canvas.
 	Width      int
 	Height     int
 	// PlotLeft/PlotRight are the X coordinates and PlotTop/PlotBottom the
 	// Y coordinates of the plot rectangle edges.
 	PlotLeft   int
 	PlotRight  int
 	PlotTop    int
 	PlotBottom int
 }
 // metricChartSeries is one plotted line: its legend name, stroke color and
 // per-point values.
 type metricChartSeries struct {
 	// Name is the legend label.
 	Name      string
 	// AxisTitle is the heading drawn above the series' own axis in the
 	// multi-axis GPU overview chart.
 	AxisTitle string
 	// Color is written verbatim into SVG stroke/fill attributes.
 	Color     string
 	// Values holds one value per chart point.
 	Values    []float64
 }
 // metricChartPalette lists the series colors used by renderMetricChartSVG;
 // series i gets entry i modulo the palette length.
 var metricChartPalette = []string{
 	"#5794f2",
 	"#73bf69",
 	"#f2cc0c",
 	"#ff9830",
 	"#f2495c",
 	"#b877d9",
 	"#56d2f7",
 	"#8ab8ff",
 	"#9adf8f",
 	"#ffbe5c",
 }
 // gpuLabelCache memoizes GPU model names by GPU index for chart titles.
 // gpuModelNameByIndex reloads it when it is empty or older than 30 seconds.
 var gpuLabelCache struct {
 	// mu guards loadedAt and byIndex.
 	mu       sync.Mutex
 	// loadedAt is the time of the most recent reload attempt.
 	loadedAt time.Time
 	// byIndex maps GPU index to model name; nil until the first load.
 	byIndex  map[int]string
 }
 // renderMetricChartSVG renders a single-Y-axis line chart as an SVG document.
 //
 // labels and times describe the X axis; the point count is the larger of the
 // two lengths, and a single blank point is used when both are empty. names
 // supplies one legend entry per series; datasets[i] holds the values for
 // names[i] (a name without a dataset is plotted as zeros). yMin and yMax
 // optionally pin the axis bounds. timeline, if non-empty, is drawn as
 // idle-span shading plus boundary lines. The returned error is always nil.
 //
 // Note that empty entries of datasets are replaced in place with zero-filled
 // slices, which is visible to the caller.
 func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
 	pointCount := len(labels)
 	if len(times) > pointCount {
 		pointCount = len(times)
 	}
 	if pointCount == 0 {
 		pointCount = 1
 		labels = []string{""}
 		times = []time.Time{time.Time{}}
 	}
 	// Pad labels with empty strings so every point has a label slot.
 	if len(labels) < pointCount {
 		padded := make([]string, pointCount)
 		copy(padded, labels)
 		labels = padded
 	}
 	if len(times) < pointCount {
 		times = synthesizeChartTimes(times, pointCount)
 	}
 	for i := range datasets {
 		if len(datasets[i]) == 0 {
 			datasets[i] = make([]float64, pointCount)
 		}
 	}
 	statsLabel := chartStatsLabel(datasets)
 	legendItems := []metricChartSeries{}
 	for i, name := range names {
 		color := metricChartPalette[i%len(metricChartPalette)]
 		values := make([]float64, pointCount)
 		if i < len(datasets) {
 			copy(values, coalesceDataset(datasets[i], pointCount))
 		}
 		legendItems = append(legendItems, metricChartSeries{
 			Name:   name,
 			Color:  color,
 			Values: values,
 		})
 	}
 	// The scale is computed from all datasets, including any without a name.
 	scale := singleAxisChartScale(datasets, yMin, yMax)
 	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
 	start, end := chartTimeBounds(times)
 	// Emission order is the SVG paint order: background layers first, series
 	// lines and legend last.
 	var b strings.Builder
 	writeSVGOpen(&b, layout.Width, layout.Height)
 	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
 	writeTimelineIdleSpans(&b, layout, start, end, timeline)
 	writeVerticalGrid(&b, layout, times, pointCount, 8)
 	writeHorizontalGrid(&b, layout, scale)
 	writeTimelineBoundaries(&b, layout, start, end, timeline)
 	writePlotBorder(&b, layout)
 	writeSingleAxisY(&b, layout, scale)
 	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
 	for _, item := range legendItems {
 		writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
 	}
 	writeLegend(&b, layout, legendItems)
 	writeSVGClose(&b)
 	return []byte(b.String()), nil
 }
 // renderGPUOverviewChartSVG builds the three-axis overview chart (temperature,
 // power, core clock) for the GPU with the given index.
 //
 // The boolean result is false, with nil bytes and nil error, when none of the
 // three datasets exists for that GPU. Errors from drawGPUOverviewChartSVG are
 // returned unchanged.
 func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
 	tempSeries := gpuDatasetByIndex(samples, idx, func(row platform.GPUMetricRow) float64 { return row.TempC })
 	powerSeries := gpuDatasetByIndex(samples, idx, func(row platform.GPUMetricRow) float64 { return row.PowerW })
 	clockSeries := gpuDatasetByIndex(samples, idx, func(row platform.GPUMetricRow) float64 { return row.ClockMHz })
 	if tempSeries == nil && powerSeries == nil && clockSeries == nil {
 		return nil, false, nil
 	}
 	xLabels := sampleTimeLabels(samples)
 	xTimes := sampleTimes(samples)
 	pointCount := len(xLabels)
 	chartTitle := gpuDisplayLabel(idx) + " Overview"
 	overview := []metricChartSeries{
 		{Name: "Temp C", Values: coalesceDataset(tempSeries, pointCount), Color: "#f05a5a", AxisTitle: "Temp C"},
 		{Name: "Power W", Values: coalesceDataset(powerSeries, pointCount), Color: "#ffb357", AxisTitle: "Power W"},
 		{Name: "Core Clock MHz", Values: coalesceDataset(clockSeries, pointCount), Color: "#73bf69", AxisTitle: "Core MHz"},
 	}
 	rendered, err := drawGPUOverviewChartSVG(chartTitle, xLabels, xTimes, overview, timeline)
 	if err != nil {
 		return nil, false, err
 	}
 	return rendered, true, nil
 }
 // drawGPUOverviewChartSVG renders a fixed-size (1400x840) chart with exactly
 // three series, each on its own Y axis: series 0 and 1 get axes to the left
 // of the plot, series 2 gets an axis to the right.
 //
 // It returns an error when len(series) != 3. labels and times are padded or
 // synthesized to a common point count the same way as in
 // renderMetricChartSVG. Series with no values are replaced in place with
 // zero-filled slices, which is visible to the caller.
 func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
 	if len(series) != 3 {
 		return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
 	}
 	const (
 		width      = 1400
 		height     = 840
 		plotLeft   = 180
 		plotRight  = 1220
 		plotTop    = 96
 		plotBottom = 660
 	)
 	// X positions of the three per-series axis lines.
 	const (
 		leftOuterAxis  = 72
 		leftInnerAxis  = 132
 		rightInnerAxis = 1268
 	)
 	layout := chartLayout{
 		Width:      width,
 		Height:     height,
 		PlotLeft:   plotLeft,
 		PlotRight:  plotRight,
 		PlotTop:    plotTop,
 		PlotBottom: plotBottom,
 	}
 	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
 	pointCount := len(labels)
 	if len(times) > pointCount {
 		pointCount = len(times)
 	}
 	if pointCount == 0 {
 		pointCount = 1
 		labels = []string{""}
 		times = []time.Time{time.Time{}}
 	}
 	if len(labels) < pointCount {
 		padded := make([]string, pointCount)
 		copy(padded, labels)
 		labels = padded
 	}
 	if len(times) < pointCount {
 		times = synthesizeChartTimes(times, pointCount)
 	}
 	for i := range series {
 		if len(series[i].Values) == 0 {
 			series[i].Values = make([]float64, pointCount)
 		}
 	}
 	// Each series gets an independent scale derived from its own values.
 	scales := make([]chartScale, len(series))
 	for i := range series {
 		min, max := chartSeriesBounds(series[i].Values)
 		ticks := chartNiceTicks(min, max, 8)
 		scales[i] = chartScale{
 			Min:   ticks[0],
 			Max:   ticks[len(ticks)-1],
 			Ticks: ticks,
 		}
 	}
 	start, end := chartTimeBounds(times)
 	var b strings.Builder
 	writeSVGOpen(&b, width, height)
 	writeChartFrame(&b, title, "", width, height)
 	writeTimelineIdleSpans(&b, layout, start, end, timeline)
 	writeVerticalGrid(&b, layout, times, pointCount, 8)
 	// Horizontal grid lines follow the first series' ticks only.
 	writeHorizontalGrid(&b, layout, scales[0])
 	writeTimelineBoundaries(&b, layout, start, end, timeline)
 	writePlotBorder(&b, layout)
 	for i, axisLineX := range axisX {
 		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
 			axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
 		fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
 			axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
 		for _, tick := range scales[i].Ticks {
 			y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
 			label := sanitizeChartText(chartYAxisNumber(tick))
 			// Left-hand axes (i < 2): tick marks point right, labels sit to the left.
 			if i < 2 {
 				fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
 					axisLineX, y, axisLineX+6, y, series[i].Color)
 				fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
 					axisLineX-8, y, series[i].Color, label)
 				continue
 			}
 			// Right-hand axis: tick marks point left, labels sit to the right.
 			fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
 				axisLineX, y, axisLineX-6, y, series[i].Color)
 			fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
 				axisLineX+8, y, series[i].Color, label)
 		}
 	}
 	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
 	for i := range series {
 		writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
 	}
 	writeLegend(&b, layout, series)
 	writeSVGClose(&b)
 	return []byte(b.String()), nil
 }
 // metricsTimelineSegments derives active/idle timeline segments for the time
 // range covered by samples, using a snapshot of the global task queue.
 // It returns nil when there are no samples or either time bound is zero.
 func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
 	if len(samples) == 0 {
 		return nil
 	}
 	first, last := chartTimeBounds(sampleTimes(samples))
 	switch {
 	case first.IsZero(), last.IsZero():
 		return nil
 	}
 	history := snapshotTaskHistory()
 	return chartTimelineSegmentsForRange(first, last, now, history)
 }
 // snapshotTaskHistory returns value copies of every task currently in
 // globalQueue, taken while holding the queue mutex.
 func snapshotTaskHistory() []Task {
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
 	copied := make([]Task, 0, len(globalQueue.tasks))
 	for _, queued := range globalQueue.tasks {
 		copied = append(copied, *queued)
 	}
 	return copied
 }
 // chartTimelineSegmentsForRange partitions [start, end] into alternating
 // idle and active segments based on task run intervals.
 //
 // A task contributes the interval from StartedAt to DoneAt, or to now when
 // DoneAt is nil; tasks that never started are ignored. Intervals are clipped
 // to the range, overlapping or touching intervals are merged, and the gaps
 // between them become idle segments. It returns nil when start or end is the
 // zero time; a range with no task activity yields a single idle segment.
 func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
 	if start.IsZero() || end.IsZero() {
 		return nil
 	}
 	// Tolerate reversed bounds.
 	if end.Before(start) {
 		start, end = end, start
 	}
 	type interval struct {
 		start time.Time
 		end   time.Time
 	}
 	active := make([]interval, 0, len(tasks))
 	for _, task := range tasks {
 		if task.StartedAt == nil {
 			continue
 		}
 		intervalStart := task.StartedAt.UTC()
 		intervalEnd := now.UTC()
 		if task.DoneAt != nil {
 			intervalEnd = task.DoneAt.UTC()
 		}
 		// Skip empty or inverted intervals.
 		if !intervalEnd.After(intervalStart) {
 			continue
 		}
 		// Skip intervals that lie entirely outside the chart range.
 		if intervalEnd.Before(start) || intervalStart.After(end) {
 			continue
 		}
 		if intervalStart.Before(start) {
 			intervalStart = start
 		}
 		if intervalEnd.After(end) {
 			intervalEnd = end
 		}
 		active = append(active, interval{start: intervalStart, end: intervalEnd})
 	}
 	// Sort by start time, then end time, so one forward pass can merge.
 	sort.Slice(active, func(i, j int) bool {
 		if active[i].start.Equal(active[j].start) {
 			return active[i].end.Before(active[j].end)
 		}
 		return active[i].start.Before(active[j].start)
 	})
 	merged := make([]interval, 0, len(active))
 	for _, span := range active {
 		if len(merged) == 0 {
 			merged = append(merged, span)
 			continue
 		}
 		last := &merged[len(merged)-1]
 		// Overlapping or touching: extend the previous interval.
 		if !span.start.After(last.end) {
 			if span.end.After(last.end) {
 				last.end = span.end
 			}
 			continue
 		}
 		merged = append(merged, span)
 	}
 	segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
 	// cursor tracks the end of the last emitted segment.
 	cursor := start
 	for _, span := range merged {
 		if span.start.After(cursor) {
 			segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
 		}
 		segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
 		cursor = span.end
 	}
 	if cursor.Before(end) {
 		segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
 	}
 	// Reached when start == end and no task overlaps: report one idle segment.
 	if len(segments) == 0 {
 		segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
 	}
 	return segments
 }
 // sampleTimes returns the Timestamp of every sample, in input order.
 func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
 	stamps := make([]time.Time, len(samples))
 	for i := range samples {
 		stamps[i] = samples[i].Timestamp
 	}
 	return stamps
 }
 // singleAxisChartScale computes the Y scale for a single-axis chart.
 //
 // When both yMin and yMax are given they are used directly. Otherwise the
 // bounds come from the combined datasets, with whichever of yMin or yMax is
 // non-nil overriding the matching bound. The bounds are then widened to the
 // tick values chosen by chartNiceTicks.
 func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
 	var lo, hi float64
 	switch {
 	case yMin != nil && yMax != nil:
 		lo, hi = *yMin, *yMax
 	default:
 		lo, hi = chartSeriesBounds(flattenDatasets(datasets))
 		if yMin != nil {
 			lo = *yMin
 		}
 		if yMax != nil {
 			hi = *yMax
 		}
 	}
 	axisTicks := chartNiceTicks(lo, hi, 8)
 	return chartScale{
 		Min:   axisTicks[0],
 		Max:   axisTicks[len(axisTicks)-1],
 		Ticks: axisTicks,
 	}
 }
 // flattenDatasets concatenates all datasets into one newly allocated slice,
 // preserving order. The result is non-nil even when there is no data.
 func flattenDatasets(datasets [][]float64) []float64 {
 	size := 0
 	for i := range datasets {
 		size += len(datasets[i])
 	}
 	flat := make([]float64, size)
 	pos := 0
 	for i := range datasets {
 		pos += copy(flat[pos:], datasets[i])
 	}
 	return flat
 }
 // singleAxisChartLayout returns the layout for a 1400-unit-wide single-axis
 // chart of the given canvas height. When the legend is shown, the plot bottom
 // is raised by 24 units per legend row (up to four entries per row) plus 24
 // units of padding.
 func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
 	legendHeight := 0
 	if chartLegendVisible(seriesCount) && seriesCount > 0 {
 		perRow := seriesCount
 		if perRow > 4 {
 			perRow = 4
 		}
 		rows := (seriesCount + perRow - 1) / perRow
 		if rows > 0 {
 			legendHeight = 24*rows + 24
 		}
 	}
 	var layout chartLayout
 	layout.Width = 1400
 	layout.Height = canvasHeight
 	layout.PlotLeft = 96
 	layout.PlotRight = 1352
 	layout.PlotTop = 72
 	layout.PlotBottom = canvasHeight - 60 - legendHeight
 	return layout
 }
 // chartTimeBounds returns the earliest and latest of times, both converted
 // to UTC. Two zero times are returned for an empty input.
 func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
 	var earliest, latest time.Time
 	for i, raw := range times {
 		ts := raw.UTC()
 		switch {
 		case i == 0:
 			earliest, latest = ts, ts
 		case ts.Before(earliest):
 			earliest = ts
 		case ts.After(latest):
 			latest = ts
 		}
 	}
 	return earliest, latest
 }
 // synthesizeChartTimes produces count timestamps spaced one minute apart.
 //
 // It returns nil for count <= 0 and returns times unchanged when it already
 // has count entries. With exactly one input time the series starts at that
 // time; for any other length the input is ignored and the series ends at the
 // current UTC time.
 func synthesizeChartTimes(times []time.Time, count int) []time.Time {
 	switch {
 	case count <= 0:
 		return nil
 	case len(times) == count:
 		return times
 	}
 	var origin time.Time
 	if len(times) == 1 {
 		origin = times[0]
 	} else {
 		origin = time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
 	}
 	series := make([]time.Time, 0, count)
 	for step := 0; step < count; step++ {
 		series = append(series, origin.Add(time.Duration(step)*time.Minute))
 	}
 	return series
 }
 // writeSVGOpen writes the opening <svg> tag, with width, height and a
 // matching "0 0 width height" viewBox, followed by a newline.
 func writeSVGOpen(b *strings.Builder, width, height int) {
 	viewBox := fmt.Sprintf("0 0 %d %d", width, height)
 	fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="%s">`+"\n", width, height, viewBox)
 }
 // writeSVGClose writes the closing </svg> tag followed by a newline.
 func writeSVGClose(b *strings.Builder) {
 	b.WriteString("</svg>")
 	b.WriteByte('\n')
 }
 // writeChartFrame draws the rounded background rectangle and the centered
 // title. A second, smaller centered line is added when subtitle contains
 // anything other than whitespace.
 func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
 	centerX := width / 2
 	fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
 	fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
 		centerX, sanitizeChartText(title))
 	if strings.TrimSpace(subtitle) == "" {
 		return
 	}
 	fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
 		centerX, sanitizeChartText(subtitle))
 }
 // writePlotBorder outlines the plot rectangle with an unfilled 1-unit stroke.
 func writePlotBorder(b *strings.Builder, layout chartLayout) {
 	plotWidth := layout.PlotRight - layout.PlotLeft
 	plotHeight := layout.PlotBottom - layout.PlotTop
 	fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
 		layout.PlotLeft, layout.PlotTop, plotWidth, plotHeight)
 }
 // writeHorizontalGrid draws one full-width grid line per scale tick, wrapped
 // in a single styled <g> group.
 func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
 	leftX, rightX := layout.PlotLeft, layout.PlotRight
 	b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
 	for i := range scale.Ticks {
 		lineY := chartYForValue(scale.Ticks[i], scale, layout.PlotTop, layout.PlotBottom)
 		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
 			leftX, lineY, rightX, lineY)
 	}
 	b.WriteString(`</g>` + "\n")
 }
 // writeVerticalGrid draws full-height grid lines at the point indices chosen
 // by gpuChartLabelIndices(pointCount, target). Nothing is written when
 // pointCount is not positive.
 func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
 	if pointCount <= 0 {
 		return
 	}
 	first, last := chartTimeBounds(times)
 	topY, bottomY := layout.PlotTop, layout.PlotBottom
 	b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
 	for _, pointIdx := range gpuChartLabelIndices(pointCount, target) {
 		lineX := chartXForTime(chartPointTime(times, pointIdx), first, last, layout.PlotLeft, layout.PlotRight)
 		fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
 			lineX, topY, lineX, bottomY)
 	}
 	b.WriteString(`</g>` + "\n")
 }
 // writeSingleAxisY draws the Y axis line along the plot's left edge, with a
 // short outward tick mark and a right-aligned numeric label at every scale
 // tick.
 func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
 	axisX := layout.PlotLeft
 	fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
 		axisX, layout.PlotTop, axisX, layout.PlotBottom)
 	for i := range scale.Ticks {
 		tickValue := scale.Ticks[i]
 		tickY := chartYForValue(tickValue, scale, layout.PlotTop, layout.PlotBottom)
 		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
 			axisX, tickY, axisX-6, tickY)
 		fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
 			axisX-10, tickY, sanitizeChartText(chartYAxisNumber(tickValue)))
 	}
 }
 // writeXAxisLabels writes the X-axis tick labels below the plot at the point
 // indices chosen by gpuChartLabelIndices, followed by the centered "Time"
 // axis caption. Indices without a matching entry in labels get an empty
 // label.
 func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
 	total := len(times)
 	if len(labels) > total {
 		total = len(labels)
 	}
 	labelY := layout.PlotBottom + 28
 	b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
 	for _, pointIdx := range gpuChartLabelIndices(total, target) {
 		labelX := chartXForTime(chartPointTime(times, pointIdx), start, end, layout.PlotLeft, layout.PlotRight)
 		var text string
 		if pointIdx < len(labels) {
 			text = labels[pointIdx]
 		}
 		fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", labelX, labelY, sanitizeChartText(text))
 	}
 	b.WriteString(`</g>` + "\n")
 	captionX := (layout.PlotLeft + layout.PlotRight) / 2
 	fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
 		captionX, layout.PlotBottom+48)
 }
 // writeSeriesPolyline draws one series as a polyline in the given color.
 //
 // A single-point series additionally gets a dot at that point. Longer series
 // get a ringed dot and a small downward-pointing triangle at the peak value;
 // when several points share the maximum, the last one is marked. Nothing is
 // written for an empty series.
 func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
 	if len(values) == 0 {
 		return
 	}
 	// Build the "x,y x,y ..." points attribute with one decimal place each.
 	var points strings.Builder
 	for idx, value := range values {
 		if idx > 0 {
 			points.WriteByte(' ')
 		}
 		x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
 		y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
 		points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
 		points.WriteByte(',')
 		points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
 	}
 	fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
 		points.String(), color)
 	// A one-point polyline has no visible extent, so mark the point with a dot.
 	if len(values) == 1 {
 		x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
 		y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
 		fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
 		return
 	}
 	// Find the peak; >= makes the latest of equal maxima win.
 	peakIdx := 0
 	peakValue := values[0]
 	for idx, value := range values[1:] {
 		if value >= peakValue {
 			peakIdx = idx + 1
 			peakValue = value
 		}
 	}
 	x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
 	y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
 	fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
 	// Triangle marker above the peak, apex pointing down at it.
 	fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
 		x, y-10, x-5, y-18, x+5, y-18, color)
 }
 // writeLegend draws the legend below the plot as a grid of color swatches and
 // series names, at most four per row. Nothing is written when
 // chartLegendVisible reports false or there are no series.
 func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
 	if !chartLegendVisible(len(series)) || len(series) == 0 {
 		return
 	}
 	perRow := len(series)
 	if perRow > 4 {
 		perRow = 4
 	}
 	cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(perRow)
 	firstRowY := layout.PlotBottom + 74
 	for i := range series {
 		rowIdx, colIdx := i/perRow, i%perRow
 		swatchX := float64(layout.PlotLeft) + cellWidth*float64(colIdx) + 8
 		swatchY := float64(firstRowY + rowIdx*24)
 		fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
 			swatchX, swatchY, swatchX+28, swatchY, series[i].Color)
 		fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
 			swatchX+38, swatchY+4, sanitizeChartText(series[i].Name))
 	}
 }
 // writeTimelineIdleSpans shades every idle (non-active) segment of positive
 // duration with a translucent full-height rectangle at least one unit wide.
 // Nothing is written when segments is empty.
 func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
 	if len(segments) == 0 {
 		return
 	}
 	plotHeight := layout.PlotBottom - layout.PlotTop
 	b.WriteString(`<g data-role="timeline-overlay">` + "\n")
 	for i := range segments {
 		seg := segments[i]
 		if !seg.Active && seg.End.After(seg.Start) {
 			fromX := chartXForTime(seg.Start, start, end, layout.PlotLeft, layout.PlotRight)
 			toX := chartXForTime(seg.End, start, end, layout.PlotLeft, layout.PlotRight)
 			fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
 				fromX, layout.PlotTop, math.Max(1, toX-fromX), plotHeight)
 		}
 	}
 	b.WriteString(`</g>` + "\n")
 }
 // writeTimelineBoundaries draws a vertical line at every interior segment
 // boundary: each segment's start except the first, and each segment's end
 // except the last. Boundaries that round to the same X are drawn once.
 // Nothing is written when segments is empty.
 func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
 	if len(segments) == 0 {
 		return
 	}
 	drawn := map[int]bool{}
 	// mark emits a boundary line at ts unless that column was already drawn.
 	mark := func(ts time.Time) {
 		col := int(math.Round(chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)))
 		if drawn[col] {
 			return
 		}
 		drawn[col] = true
 		fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", col, layout.PlotTop, col, layout.PlotBottom)
 	}
 	lastIdx := len(segments) - 1
 	b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
 	for i := range segments {
 		if i > 0 {
 			mark(segments[i].Start)
 		}
 		if i < lastIdx {
 			mark(segments[i].End)
 		}
 	}
 	b.WriteString(`</g>` + "\n")
 }
 // chartXForTime maps ts linearly from [start, end] onto [left, right].
 // ts is clamped into the range first; when end is not after start the
 // horizontal midpoint is returned.
 func chartXForTime(ts, start, end time.Time, left, right int) float64 {
 	if !end.After(start) {
 		return float64(left+right) / 2
 	}
 	switch {
 	case ts.Before(start):
 		ts = start
 	case ts.After(end):
 		ts = end
 	}
 	elapsed := float64(ts.Sub(start))
 	total := float64(end.Sub(start))
 	fraction := elapsed / total
 	return float64(left) + fraction*float64(right-left)
 }
 // chartPointTime returns the UTC timestamp for point idx.
 //
 // When times[idx] is missing or zero, the time is extrapolated at one minute
 // per index from times[0] if that is non-zero, and otherwise from the current
 // time.
 func chartPointTime(times []time.Time, idx int) time.Time {
 	inRange := idx >= 0 && idx < len(times)
 	if inRange && !times[idx].IsZero() {
 		return times[idx].UTC()
 	}
 	offset := time.Duration(idx) * time.Minute
 	if len(times) > 0 && !times[0].IsZero() {
 		return times[0].UTC().Add(offset)
 	}
 	return time.Now().UTC().Add(offset)
 }
 // chartYForValue maps value linearly onto the vertical plot range, with
 // scale.Min at plotBottom and scale.Max at plotTop. The vertical midpoint is
 // returned when the scale has no positive extent.
 func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
 	if scale.Max <= scale.Min {
 		return float64(plotTop+plotBottom) / 2
 	}
 	fraction := (value - scale.Min) / (scale.Max - scale.Min)
 	return float64(plotBottom) - fraction*float64(plotBottom-plotTop)
 }
 // chartSeriesBounds returns padded lower and upper axis bounds for values.
 //
 // Empty input and an all-zero series yield (0, 1). A constant non-zero
 // series is widened by 10% of its magnitude on both sides. When the lower
 // bound is positive, a further 20% of the span is added on both sides, with
 // the lower bound floored at zero.
 func chartSeriesBounds(values []float64) (float64, float64) {
 	if len(values) == 0 {
 		return 0, 1
 	}
 	lo, hi := values[0], values[0]
 	for i := 1; i < len(values); i++ {
 		v := values[i]
 		if v < lo {
 			lo = v
 		}
 		if v > hi {
 			hi = v
 		}
 	}
 	if lo == hi {
 		if hi == 0 {
 			return 0, 1
 		}
 		margin := math.Abs(hi) * 0.1
 		if margin == 0 {
 			margin = 1
 		}
 		lo -= margin
 		hi += margin
 	}
 	if !(lo > 0) {
 		return lo, hi
 	}
 	margin := (hi - lo) * 0.2
 	if margin == 0 {
 		margin = hi * 0.1
 	}
 	lo -= margin
 	if lo < 0 {
 		lo = 0
 	}
 	hi += margin
 	return lo, hi
 }
 // chartNiceTicks returns evenly spaced tick values covering [min, max], with
 // the step chosen from 1, 2, 5 or 10 times a power of ten so that roughly
 // target ticks are produced.
 //
 // The result always has at least two entries, so callers may index the first
 // and last element. Degenerate input is normalized instead of yielding an
 // empty slice: reversed bounds are swapped, non-finite bounds fall back to
 // the range [0, 1], a non-positive target is treated as 1, and equal bounds
 // are widened by one.
 func chartNiceTicks(min, max float64, target int) []float64 {
 	if math.IsNaN(min) || math.IsInf(min, 0) || math.IsNaN(max) || math.IsInf(max, 0) {
 		min, max = 0, 1
 	}
 	if min > max {
 		min, max = max, min
 	}
 	if target <= 0 {
 		target = 1
 	}
 	if min == max {
 		max = min + 1
 	}
 	// fallback is used whenever a usable step cannot be derived, for example
 	// when the span overflows or is below floating-point resolution.
 	fallback := []float64{min, max}
 	span := max - min
 	if !(span > 0) || math.IsInf(span, 0) {
 		return fallback
 	}
 	step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
 	for _, factor := range []float64{1, 2, 5, 10} {
 		if span/(factor*step) <= float64(target)*1.5 {
 			step = factor * step
 			break
 		}
 	}
 	if !(step > 0) || math.IsInf(step, 0) {
 		return fallback
 	}
 	low := math.Floor(min/step) * step
 	high := math.Ceil(max/step) * step
 	intervals := math.Round((high - low) / step)
 	if math.IsNaN(intervals) || intervals < 1 || intervals > 1000 {
 		return fallback
 	}
 	// Derive each tick from its index rather than accumulating the step, so
 	// the loop always terminates and rounding error does not build up.
 	count := int(intervals)
 	ticks := make([]float64, 0, count+1)
 	for i := 0; i <= count; i++ {
 		value := low + float64(i)*step
 		ticks = append(ticks, math.Round(value*1e9)/1e9)
 	}
 	return ticks
 }
 // valueClamp limits value to the closed range [scale.Min, scale.Max].
 func valueClamp(value float64, scale chartScale) float64 {
 	switch {
 	case value < scale.Min:
 		return scale.Min
 	case value > scale.Max:
 		return scale.Max
 	default:
 		return value
 	}
 }
 // chartStatsLabel formats the min/avg/max reported by globalStats as a chart
 // subtitle. It returns "" when none of the three values is positive.
 func chartStatsLabel(datasets [][]float64) string {
 	lowest, mean, highest := globalStats(datasets)
 	nothingPositive := highest <= 0 && mean <= 0 && lowest <= 0
 	if nothingPositive {
 		return ""
 	}
 	minText := chartLegendNumber(lowest)
 	avgText := chartLegendNumber(mean)
 	maxText := chartLegendNumber(highest)
 	return fmt.Sprintf("min %s   avg %s   max %s", minText, avgText, maxText)
 }
 // gpuDisplayLabel returns "GPU <idx>", extended with an em dash and the model
 // name when gpuModelNameByIndex has one for that index.
 func gpuDisplayLabel(idx int) string {
 	model := gpuModelNameByIndex(idx)
 	if model == "" {
 		return fmt.Sprintf("GPU %d", idx)
 	}
 	return fmt.Sprintf("GPU %d — %s", idx, model)
 }
 // gpuModelNameByIndex returns the trimmed, cached model name for GPU idx, or
 // "" when none is known. The cache is reloaded under its mutex when it has
 // never been loaded or is older than 30 seconds.
 func gpuModelNameByIndex(idx int) string {
 	checkedAt := time.Now()
 	gpuLabelCache.mu.Lock()
 	expired := checkedAt.Sub(gpuLabelCache.loadedAt) > 30*time.Second
 	if expired || gpuLabelCache.byIndex == nil {
 		gpuLabelCache.loadedAt = checkedAt
 		gpuLabelCache.byIndex = loadGPUModelNames()
 	}
 	model := gpuLabelCache.byIndex[idx]
 	gpuLabelCache.mu.Unlock()
 	return strings.TrimSpace(model)
 }
 // loadGPUModelNames lists NVIDIA GPUs through the platform package and
 // returns their non-blank, trimmed names keyed by GPU index. A listing error
 // yields an empty, non-nil map.
 func loadGPUModelNames() map[int]string {
 	names := make(map[int]string)
 	gpus, err := platform.New().ListNvidiaGPUs()
 	if err != nil {
 		return names
 	}
 	for _, gpu := range gpus {
 		trimmed := strings.TrimSpace(gpu.Name)
 		if trimmed == "" {
 			continue
 		}
 		names[gpu.Index] = trimmed
 	}
 	return names
 }
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -9,13 +9,14 @@ import (
 // jobState holds the output lines and completion status of an async job.
 type jobState struct {
-	lines   []string
+	lines        []string
-	done    bool
+	done         bool
-	err     string
+	err          string
-	mu      sync.Mutex
+	mu           sync.Mutex
-	subs    []chan string
+	subs         []chan string
-	cancel  func() // optional cancel function; nil if job is not cancellable
+	cancel       func() // optional cancel function; nil if job is not cancellable
-	logPath string
+	logPath      string
 	serialPrefix string
 }
 // abort cancels the job if it has a cancel function and is not yet done.
@@ -36,6 +37,9 @@ func (j *jobState) append(line string) {
 	if j.logPath != "" {
 		appendJobLog(j.logPath, line)
 	}
 	if j.serialPrefix != "" {
 		taskSerialWriteLine(j.serialPrefix + line)
 	}
 	for _, ch := range j.subs {
 		select {
 		case ch <- line:
@@ -84,12 +88,12 @@ func (m *jobManager) create(id string) *jobState {
 	j := &jobState{}
 	m.jobs[id] = j
 	// Schedule cleanup after 30 minutes
-	go func() {
+	goRecoverOnce("job cleanup", func() {
 		time.Sleep(30 * time.Minute)
 		m.mu.Lock()
 		delete(m.jobs, id)
 		m.mu.Unlock()
-	}()
+	})
 	return j
 }
@@ -107,8 +111,11 @@ func (m *jobManager) get(id string) (*jobState, bool) {
 	return j, ok
 }
-func newTaskJobState(logPath string) *jobState {
+func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
 	j := &jobState{logPath: logPath}
 	if len(serialPrefix) > 0 {
 		j.serialPrefix = serialPrefix[0]
 	}
 	if logPath == "" {
 		return j
 	}
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -0,0 +1,242 @@
 package webui
 import (
 	"bufio"
 	"io"
 	"log/slog"
 	"os"
 	"strings"
 	"sync"
 	"time"
 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 )
 // kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
 // It supports multiple concurrent SAT tasks: a shared event window is open
 // while any SAT task is running, and flushed when all tasks complete.
 type kmsgWatcher struct {
 	// mu guards activeCount and window.
 	mu          sync.Mutex
 	activeCount int // number of in-flight SAT tasks
 	// window is the currently open event window; nil when no task is running.
 	window      *kmsgWindow
 	// statusDB receives the flushed component warnings; flushing is skipped
 	// when it is nil.
 	statusDB    *app.ComponentStatusDB
 }
 // kmsgWindow collects the kernel error events seen while at least one SAT
 // task is running.
 type kmsgWindow struct {
 	targets   []string // SAT targets running concurrently
 	// startedAt is when the window was opened (first task start).
 	startedAt time.Time
 	// seen records which (id, category) pairs have already been captured.
 	seen      map[kmsgEventKey]bool
 	// events holds the captured events in arrival order.
 	events    []kmsgEvent
 }
 // kmsgEventKey is the deduplication key for events within one window.
 type kmsgEventKey struct {
 	id       string // BDF or device name
 	// category is the category of the pattern that matched the message.
 	category string
 }
 // kmsgEvent is one kernel log message that matched a hardware error pattern.
 type kmsgEvent struct {
 	// timestamp is when the line was parsed, not the kernel's own timestamp.
 	timestamp time.Time
 	// raw is the message text with the kmsg record prefix removed.
 	raw       string
 	ids       []string // BDF addresses or device names extracted
 	// category is copied from the matching pattern.
 	category  string
 }
 // newKmsgWatcher returns an idle watcher that will record flushed events into
 // statusDB. Reading does not begin until start is called.
 func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
 	watcher := new(kmsgWatcher)
 	watcher.statusDB = statusDB
 	return watcher
 }
 // start launches the background kmsg reading goroutine.
 // It hands w.run to goRecoverLoop under the name "kmsg watcher" with a
 // 5-second interval argument.
 func (w *kmsgWatcher) start() {
 	goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
 }
 // run is the watcher's read loop and never returns. It opens /dev/kmsg,
 // scans it line by line, and records every line that parses as a hardware
 // error while an event window is open. If the device cannot be opened it
 // retries after 30 seconds; when scanning stops it closes the file and
 // reopens it after 2 seconds.
 func (w *kmsgWatcher) run() {
 	for {
 		f, err := os.Open("/dev/kmsg")
 		if err != nil {
 			slog.Warn("kmsg watcher unavailable", "err", err)
 			time.Sleep(30 * time.Second)
 			continue
 		}
 		// Best-effort seek to end so we only capture events from now forward.
 		_, _ = f.Seek(0, io.SeekEnd)
 		scanner := bufio.NewScanner(f)
 		// Allow lines of up to 64 KiB.
 		scanner.Buffer(make([]byte, 64*1024), 64*1024)
 		for scanner.Scan() {
 			line := scanner.Text()
 			evt, ok := parseKmsgLine(line)
 			if !ok {
 				continue
 			}
 			// Matching events are dropped when no window is open.
 			w.mu.Lock()
 			if w.window != nil {
 				w.recordEvent(evt)
 			}
 			w.mu.Unlock()
 		}
 		if err := scanner.Err(); err != nil {
 			slog.Warn("kmsg watcher stopped", "err", err)
 		}
 		_ = f.Close()
 		time.Sleep(2 * time.Second)
 	}
 }
 // recordEvent appends evt to the active window, deduplicating by (id, category).
 // An event without ids is keyed by the empty id. The event is stored at most
 // once, and only when at least one of its keys has not been seen before in
 // this window; every key it carries is marked as seen.
 // Must be called with w.mu held and w.window non-nil.
 func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
 	ids := evt.ids
 	if len(ids) == 0 {
 		ids = []string{""}
 	}
 	fresh := false
 	for _, id := range ids {
 		key := kmsgEventKey{id: id, category: evt.category}
 		if !w.window.seen[key] {
 			w.window.seen[key] = true
 			fresh = true
 		}
 	}
 	// Append once per event rather than once per new id, so the window holds
 	// no duplicate entries for an event that names several devices.
 	if fresh {
 		w.window.events = append(w.window.events, evt)
 	}
 }
 // NotifyTaskStarted registers one more running SAT task. The first task to
 // start opens a fresh shared event window; every task's target is appended to
 // the open window's target list. taskID is currently unused.
 func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
 	w.mu.Lock()
 	defer w.mu.Unlock()
 	firstTask := w.activeCount == 0
 	w.activeCount++
 	if firstTask {
 		w.window = &kmsgWindow{
 			seen:      map[kmsgEventKey]bool{},
 			startedAt: time.Now(),
 		}
 	}
 	if open := w.window; open != nil {
 		open.targets = append(open.targets, target)
 	}
 }
 // NotifyTaskFinished registers that one SAT task has ended. Once no tasks
 // remain, the shared window is detached and, if it captured any events,
 // flushed to the status DB via goRecoverOnce. The counter never drops below
 // zero. taskID is currently unused.
 func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
 	finished := func() *kmsgWindow {
 		w.mu.Lock()
 		defer w.mu.Unlock()
 		w.activeCount--
 		if w.activeCount > 0 {
 			return nil
 		}
 		w.activeCount = 0
 		detached := w.window
 		w.window = nil
 		return detached
 	}()
 	if finished == nil || len(finished.events) == 0 {
 		return
 	}
 	goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(finished) })
 }
 // flushWindow records one "Warning" per affected component in the status DB
 // for the events captured in window; it does nothing when statusDB is nil.
 //
 // Events without ids map to "memory:all" for the "memory" category and to
 // "cpu:all" otherwise. Events with ids map to "storage:<id>" for the
 // "storage" category and to "pcie:<normalized BDF>" for every other category.
 // Only the first raw message per component key is kept; it is truncated to
 // 120 bytes and prefixed with the window's comma-joined target list.
 func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
 	if w.statusDB == nil {
 		return
 	}
 	source := "watchdog:kmsg"
 	// Collect unique component keys from events.
 	seen := map[string]string{} // componentKey → first raw line
 	for _, evt := range window.events {
 		if len(evt.ids) == 0 {
 			// MCE or un-identified error.
 			key := "cpu:all"
 			if evt.category == "memory" {
 				key = "memory:all"
 			}
 			if _, exists := seen[key]; !exists {
 				seen[key] = evt.raw
 			}
 			continue
 		}
 		for _, id := range evt.ids {
 			var key string
 			switch evt.category {
 			case "gpu", "pcie":
 				key = "pcie:" + normalizeBDF(id)
 			case "storage":
 				key = "storage:" + id
 			default:
 				key = "pcie:" + normalizeBDF(id)
 			}
 			if _, exists := seen[key]; !exists {
 				seen[key] = evt.raw
 			}
 		}
 	}
 	// Map iteration order is unspecified, so components are recorded in no
 	// particular order.
 	for key, detail := range seen {
 		detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
 		w.statusDB.Record(key, source, "Warning", detail)
 	}
 }
 // parseKmsgLine converts one /dev/kmsg record into an event when its message
 // matches an entry of platform.HardwareErrorPatterns; the first matching
 // pattern wins. The text after the first ';' is taken as the message (the
 // whole line when there is no ';').
 // kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
 func parseKmsgLine(raw string) (kmsgEvent, bool) {
 	text := raw
 	if _, body, found := strings.Cut(raw, ";"); found {
 		text = strings.TrimSpace(body)
 	}
 	if text == "" {
 		return kmsgEvent{}, false
 	}
 	for _, pattern := range platform.HardwareErrorPatterns {
 		groups := pattern.Re.FindStringSubmatch(text)
 		if groups == nil {
 			continue
 		}
 		var ids []string
 		// A capture group index of 0 or less means the pattern has no such group.
 		if g := pattern.BDFGroup; g > 0 && g < len(groups) {
 			ids = append(ids, normalizeBDF(groups[g]))
 		}
 		if g := pattern.DevGroup; g > 0 && g < len(groups) {
 			ids = append(ids, groups[g])
 		}
 		return kmsgEvent{
 			timestamp: time.Now(),
 			raw:       text,
 			ids:       ids,
 			category:  pattern.Category,
 		}, true
 	}
 	return kmsgEvent{}, false
 }
 // normalizeBDF trims and lower-cases a PCIe BDF and, when it contains exactly
 // one ':' (no domain part), prefixes the default domain to give the 4-part
 // form "0000:c8:00.0". Any other input is returned trimmed and lower-cased.
 func normalizeBDF(bdf string) string {
 	cleaned := strings.TrimSpace(bdf)
 	cleaned = strings.ToLower(cleaned)
 	if strings.Count(cleaned, ":") != 1 {
 		return cleaned
 	}
 	return "0000:" + cleaned
 }
 // truncate returns s unchanged when it is at most max bytes long; otherwise
 // it returns a prefix of at most max bytes followed by "...".
 //
 // The cut is moved back by up to three bytes so it never lands inside a
 // multi-byte UTF-8 sequence, and a negative max is treated as zero instead of
 // panicking.
 func truncate(s string, max int) string {
 	if max < 0 {
 		max = 0
 	}
 	if len(s) <= max {
 		return s
 	}
 	cut := max
 	// s[cut] is the first dropped byte. While it is a UTF-8 continuation byte
 	// (bit pattern 10xxxxxx) the cut is mid-rune; a rune has at most three
 	// continuation bytes, which bounds the back-off.
 	for cut > 0 && max-cut < 3 && s[cut]&0xC0 == 0x80 {
 		cut--
 	}
 	return s[:cut] + "..."
 }
 // isSATTarget reports whether target names one of the hardware acceptance
 // test task targets listed below. Matching is exact and case-sensitive.
 func isSATTarget(target string) bool {
 	switch target {
 	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
 		"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
 		"nvidia-bandwidth", "nvidia-stress":
 		return true
 	case "amd", "amd-mem", "amd-bandwidth", "amd-stress":
 		return true
 	case "memory", "memory-stress", "storage", "cpu", "sat-stress", "platform-stress":
 		return true
 	default:
 		return false
 	}
 }
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"sort"
 	"strconv"
 	"strings"
 	"time"
 	"bee/audit/internal/platform"
@@ -21,6 +22,13 @@ type MetricsDB struct {
 	db *sql.DB
 }
 // Close closes the underlying database handle and returns its error. It is a
 // no-op returning nil when the receiver or its handle is nil.
 func (m *MetricsDB) Close() error {
 	if m != nil && m.db != nil {
 		return m.db.Close()
 	}
 	return nil
 }
 // openMetricsDB opens (or creates) the metrics database at the given path.
 func openMetricsDB(path string) (*MetricsDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
@@ -54,6 +62,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
  usage_pct     REAL,
  mem_usage_pct REAL,
  power_w       REAL,
  clock_mhz     REAL,
  mem_clock_mhz REAL,
  PRIMARY KEY (ts, gpu_index)
 );
 CREATE TABLE IF NOT EXISTS fan_metrics (
@@ -70,6 +80,38 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
  PRIMARY KEY (ts, name)
 );
 `)
 	if err != nil {
 		return err
 	}
 	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
 		return err
 	}
 	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
 }
 // ensureMetricsColumn adds column (with the given SQL definition) to table
 // unless a column of that name, compared case-insensitively, is already listed
 // by PRAGMA table_info. Query, scan, iteration and ALTER TABLE errors are
 // returned as-is. The identifiers are concatenated into the SQL text, so
 // callers must pass trusted names only.
 func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
 	info, err := db.Query("PRAGMA table_info(" + table + ")")
 	if err != nil {
 		return err
 	}
 	defer info.Close()
 	for info.Next() {
 		// Each table_info row is scanned as cid, name, type, notnull,
 		// dflt_value, pk; only the name is inspected here.
 		var (
 			cid, notNull, pk int
 			name, ctype      string
 			dflt             sql.NullString
 		)
 		if scanErr := info.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); scanErr != nil {
 			return scanErr
 		}
 		if strings.EqualFold(name, column) {
 			return nil
 		}
 	}
 	if iterErr := info.Err(); iterErr != nil {
 		return iterErr
 	}
 	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
 	return err
 }
@@ -91,8 +133,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	}
 	for _, g := range s.GPUs {
 		_, err = tx.Exec(
-			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
+			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
-			ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
+			ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
 		)
 		if err != nil {
 			return err
@@ -129,6 +171,23 @@ func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
 	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
 }
 // LoadBetween returns the samples whose ts lies between start and end
 // (inclusive, compared as Unix seconds), ordered by ts. A nil receiver or a
 // zero start/end yields (nil, nil); bounds given in reverse order are swapped.
 func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
 	if m == nil || start.IsZero() || end.IsZero() {
 		return nil, nil
 	}
 	from, to := start, end
 	if to.Before(from) {
 		from, to = to, from
 	}
 	const query = `SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`
 	return m.loadSamples(query, from.Unix(), to.Unix())
 }
 // loadSamples reconstructs LiveMetricSample rows from the normalized tables.
 func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
 	rows, err := m.db.Query(query, args...)
@@ -163,7 +222,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	}
 	gpuData := map[gpuKey]platform.GPUMetricRow{}
 	gRows, err := m.db.Query(
-		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
+		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
 		minTS, maxTS,
 	)
 	if err == nil {
@@ -171,7 +230,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 		for gRows.Next() {
 			var ts int64
 			var g platform.GPUMetricRow
-			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
+			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
 				gpuData[gpuKey{ts, g.GPUIndex}] = g
 			}
 		}
@@ -283,7 +342,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	rows, err := m.db.Query(`
 		SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
-		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
+		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
 		       g.clock_mhz, g.mem_clock_mhz
 		FROM sys_metrics s
 		LEFT JOIN gpu_metrics g ON g.ts = s.ts
 		ORDER BY s.ts, g.gpu_index
@@ -294,13 +354,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	defer rows.Close()
 	cw := csv.NewWriter(w)
-	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
+	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
 	for rows.Next() {
 		var ts int64
 		var cpu, mem, pwr float64
 		var gpuIdx sql.NullInt64
-		var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
+		var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
-		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
+		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
 			continue
 		}
 		row := []string{
@@ -316,9 +376,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 				strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
 			)
 		} else {
-			row = append(row, "", "", "", "", "")
+			row = append(row, "", "", "", "", "", "", "")
 		}
 		_ = cw.Write(row)
 	}
@@ -326,9 +388,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	return cw.Error()
 }
 // Close closes the database.
 func (m *MetricsDB) Close() { _ = m.db.Close() }
 // nullFloat wraps v in a sql.NullFloat64 that is marked valid (non-NULL).
 func nullFloat(v float64) sql.NullFloat64 {
 	var out sql.NullFloat64
 	out.Float64, out.Valid = v, true
 	return out
 }
--- a/audit/internal/webui/metricsdb_test.go
+++ b/audit/internal/webui/metricsdb_test.go
@@ -1,11 +1,13 @@
 package webui
 import (
 	"database/sql"
 	"path/filepath"
 	"testing"
 	"time"
 	"bee/audit/internal/platform"
 	_ "modernc.org/sqlite"
 )
 func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
@@ -67,3 +69,106 @@ func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
 		}
 	}
 }
 // TestMetricsDBMigratesLegacyGPUSchema creates a database whose gpu_metrics
 // table lacks the clock_mhz and mem_clock_mhz columns, reopens it through
 // openMetricsDB, and checks that GPU clock values written afterwards are read
 // back unchanged by LoadAll.
 func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
 	path := filepath.Join(t.TempDir(), "metrics.db")
 	// Build the legacy schema by hand with a raw connection.
 	raw, err := sql.Open("sqlite", path)
 	if err != nil {
 		t.Fatalf("sql.Open: %v", err)
 	}
 	_, err = raw.Exec(`
 CREATE TABLE gpu_metrics (
  ts            INTEGER NOT NULL,
  gpu_index     INTEGER NOT NULL,
  temp_c        REAL,
  usage_pct     REAL,
  mem_usage_pct REAL,
  power_w       REAL,
  PRIMARY KEY (ts, gpu_index)
 );
 CREATE TABLE sys_metrics (
  ts           INTEGER NOT NULL,
  cpu_load_pct REAL,
  mem_load_pct REAL,
  power_w      REAL,
  PRIMARY KEY (ts)
 );
 CREATE TABLE fan_metrics (
  ts   INTEGER NOT NULL,
  name TEXT NOT NULL,
  rpm  REAL,
  PRIMARY KEY (ts, name)
 );
 CREATE TABLE temp_metrics (
  ts      INTEGER NOT NULL,
  name    TEXT NOT NULL,
  grp     TEXT NOT NULL,
  celsius REAL,
  PRIMARY KEY (ts, name)
 );
 `)
 	if err != nil {
 		t.Fatalf("create legacy schema: %v", err)
 	}
 	// Release the raw handle before the code under test opens the same file.
 	_ = raw.Close()
 	// Reopening through openMetricsDB is expected to add the missing columns.
 	db, err := openMetricsDB(path)
 	if err != nil {
 		t.Fatalf("openMetricsDB: %v", err)
 	}
 	defer db.Close()
 	now := time.Unix(1_700_000_100, 0).UTC()
 	// Write one sample that carries only the new clock fields.
 	err = db.Write(platform.LiveMetricSample{
 		Timestamp: now,
 		GPUs: []platform.GPUMetricRow{
 			{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
 		},
 	})
 	if err != nil {
 		t.Fatalf("Write: %v", err)
 	}
 	samples, err := db.LoadAll()
 	if err != nil {
 		t.Fatalf("LoadAll: %v", err)
 	}
 	// Exactly one sample with one GPU row must come back before indexing into it.
 	if len(samples) != 1 || len(samples[0].GPUs) != 1 {
 		t.Fatalf("samples=%+v", samples)
 	}
 	if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
 		t.Fatalf("ClockMHz=%v want 1410", got)
 	}
 	if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
 		t.Fatalf("MemClockMHz=%v want 2600", got)
 	}
 }
 // TestMetricsDBLoadBetweenFiltersWindow writes five samples one minute apart
 // and checks that LoadBetween over minutes 1..3 returns exactly the three
 // samples at those timestamps, with both window edges included.
 func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
 	db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
 	if err != nil {
 		t.Fatalf("openMetricsDB: %v", err)
 	}
 	defer db.Close()
 	base := time.Unix(1_700_000_000, 0).UTC()
 	// Samples at base+0m .. base+4m; CPULoadPct carries the sample number.
 	for i := 0; i < 5; i++ {
 		if err := db.Write(platform.LiveMetricSample{
 			Timestamp:  base.Add(time.Duration(i) * time.Minute),
 			CPULoadPct: float64(i),
 		}); err != nil {
 			t.Fatalf("Write(%d): %v", i, err)
 		}
 	}
 	got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
 	if err != nil {
 		t.Fatalf("LoadBetween: %v", err)
 	}
 	// The length check guards the got[0] / got[2] accesses below.
 	if len(got) != 3 {
 		t.Fatalf("LoadBetween len=%d want 3", len(got))
 	}
 	if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
 		t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
 	}
 }
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/serial_console.go
+++ b/audit/internal/webui/serial_console.go
@@ -0,0 +1,41 @@
 package webui
 import (
 	"fmt"
 	"os"
 	"strings"
 	"time"
 )
 // taskSerialWriteLine is the function taskSerialEvent calls to emit a line; it
 // defaults to writeTaskSerialLine. NOTE(review): presumably kept as a variable
 // so tests can substitute a fake writer — confirm against the test files.
 var taskSerialWriteLine = writeTaskSerialLine
 // writeTaskSerialLine writes line, prefixed with a UTC timestamp, to the first
 // of /dev/ttyS0, /dev/ttyS1 and /dev/console that can be opened for writing.
 // Lines that are blank after trimming are dropped. Output is best effort: a
 // device that fails to open is skipped, and write/close errors on the device
 // that did open are ignored without trying the remaining ones.
 func writeTaskSerialLine(line string) {
 	text := strings.TrimSpace(line)
 	if text == "" {
 		return
 	}
 	stamp := time.Now().UTC().Format("2006-01-02 15:04:05Z")
 	payload := fmt.Sprintf("%s %s\n", stamp, text)
 	devices := [...]string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"}
 	for i := range devices {
 		f, err := os.OpenFile(devices[i], os.O_WRONLY|os.O_APPEND, 0)
 		if err != nil {
 			continue
 		}
 		_, _ = f.WriteString(payload)
 		_ = f.Close()
 		return
 	}
 }
 // taskSerialPrefix builds the tag placed in front of a task's serial lines:
 // "[task <ID> <Name>] " for a task, or "[task] " when t is nil.
 func taskSerialPrefix(t *Task) string {
 	if t != nil {
 		return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
 	}
 	return "[task] "
 }
 // taskSerialEvent emits the trimmed event text, prefixed with the task tag,
 // through taskSerialWriteLine. It does nothing when t is nil.
 func taskSerialEvent(t *Task, event string) {
 	if t == nil {
 		return
 	}
 	taskSerialWriteLine(taskSerialPrefix(t) + strings.TrimSpace(event))
 }
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -1,15 +1,19 @@
 package webui
 import (
 	"bufio"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"html"
 	"io"
 	"log/slog"
 	"mime"
 	"net"
 	"net/http"
 	"os"
 	"path/filepath"
 	"runtime/debug"
 	"sort"
 	"strings"
 	"sync"
@@ -18,7 +22,6 @@ import (
 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 	"bee/audit/internal/runtimeenv"
 	gocharts "github.com/go-analyze/charts"
 	"reanimator/chart/viewer"
 	"reanimator/chart/web"
 )
@@ -164,6 +167,8 @@ type handler struct {
 	// pending network change (rollback on timeout)
 	pendingNet   *pendingNetChange
 	pendingNetMu sync.Mutex
 	// kmsg hardware error watcher
 	kmsg *kmsgWatcher
 }
 // NewHandler creates the HTTP mux with all routes.
@@ -203,12 +208,24 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	}
 	h.startMetricsCollector()
 	// Start kmsg hardware error watcher if the app (and its status DB) is available.
 	if opts.App != nil {
 		h.kmsg = newKmsgWatcher(opts.App.StatusDB)
 		h.kmsg.start()
 		globalQueue.kmsgWatcher = h.kmsg
 	}
 	globalQueue.startWorker(&opts)
 	mux := http.NewServeMux()
 	// ── Infrastructure ──────────────────────────────────────────────────────
 	mux.HandleFunc("GET /healthz", h.handleHealthz)
 	mux.HandleFunc("GET /api/ready", h.handleReady)
 	mux.HandleFunc("GET /loading", func(w http.ResponseWriter, r *http.Request) {
 		w.Header().Set("Cache-Control", "no-store")
 		w.Header().Set("Content-Type", "text/html; charset=utf-8")
 		_, _ = w.Write([]byte(loadingPageHTML))
 	})
 	// ── Existing read-only endpoints (preserved for compatibility) ──────────
 	mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
@@ -225,6 +242,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// SAT
 	mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
 	mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
 	mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
 	mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
 	mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
 	mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
 	mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
 	mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
 	mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
 	mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
@@ -238,6 +261,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
 	mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
 	mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
 	// Tasks
 	mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -246,6 +270,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
 	mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
 	mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
 	mux.HandleFunc("GET /api/tasks/{id}/charts", h.handleAPITaskChartsIndex)
 	mux.HandleFunc("GET /api/tasks/{id}/chart/", h.handleAPITaskChartSVG)
 	mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
 	// Services
 	mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
@@ -274,6 +301,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
 	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
 	mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
 	// System
@@ -300,11 +328,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("GET /", h.handlePage)
 	h.mux = mux
-	return mux
+	return recoverMiddleware(mux)
 }
 func (h *handler) startMetricsCollector() {
-	go func() {
+	goRecoverLoop("metrics collector", 2*time.Second, func() {
 		ticker := time.NewTicker(metricsCollectInterval)
 		defer ticker.Stop()
 		for range ticker.C {
@@ -315,7 +343,7 @@ func (h *handler) startMetricsCollector() {
 			h.feedRings(sample)
 			h.setLatestMetric(sample)
 		}
-	}()
+	})
 }
 func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
@@ -336,7 +364,81 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
 // ListenAndServe starts the HTTP server.
 func ListenAndServe(addr string, opts HandlerOptions) error {
-	return http.ListenAndServe(addr, NewHandler(opts))
+	srv := &http.Server{
 		Addr:              addr,
 		Handler:           NewHandler(opts),
 		ReadHeaderTimeout: 5 * time.Second,
 		ReadTimeout:       30 * time.Second,
 		IdleTimeout:       2 * time.Minute,
 	}
 	return srv.ListenAndServe()
 }
 // trackingResponseWriter wraps an http.ResponseWriter and remembers whether the
 // response has been started (header, body bytes or a flush passed through) or
 // the connection was taken over. recoverMiddleware reads wroteHeader after a
 // panic to decide whether it may still send a 500. The optional interfaces
 // http.Flusher, http.Hijacker, http.Pusher and io.ReaderFrom are forwarded to
 // the wrapped writer when it supports them.
 type trackingResponseWriter struct {
 	http.ResponseWriter
 	wroteHeader bool // true once nothing more may be written by the middleware
 }
 // WriteHeader marks the response as started and forwards the status code.
 func (w *trackingResponseWriter) WriteHeader(statusCode int) {
 	w.wroteHeader = true
 	w.ResponseWriter.WriteHeader(statusCode)
 }
 // Write marks the response as started and forwards the body bytes.
 func (w *trackingResponseWriter) Write(p []byte) (int, error) {
 	w.wroteHeader = true
 	return w.ResponseWriter.Write(p)
 }
 // Flush forwards to the wrapped writer's http.Flusher. The response is marked
 // as started only when a Flusher exists; otherwise nothing is sent and the
 // call is a no-op.
 func (w *trackingResponseWriter) Flush() {
 	f, ok := w.ResponseWriter.(http.Flusher)
 	if !ok {
 		return
 	}
 	w.wroteHeader = true
 	f.Flush()
 }
 // Hijack forwards to the wrapped writer's http.Hijacker, or returns an error
 // when hijacking is not supported. After a successful hijack the response is
 // marked as started, because the ResponseWriter must no longer be used.
 func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
 	h, ok := w.ResponseWriter.(http.Hijacker)
 	if !ok {
 		return nil, nil, fmt.Errorf("hijacking not supported")
 	}
 	conn, rw, err := h.Hijack()
 	if err == nil {
 		w.wroteHeader = true
 	}
 	return conn, rw, err
 }
 // Push forwards to the wrapped writer's http.Pusher, or returns
 // http.ErrNotSupported when server push is unavailable.
 func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
 	p, ok := w.ResponseWriter.(http.Pusher)
 	if !ok {
 		return http.ErrNotSupported
 	}
 	return p.Push(target, opts)
 }
 // ReadFrom streams r into the response, using the wrapped writer's
 // io.ReaderFrom when available and io.Copy otherwise. Both paths mark the
 // response as started, so a later panic does not trigger a second header.
 func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
 	w.wroteHeader = true
 	if rf, ok := w.ResponseWriter.(io.ReaderFrom); ok {
 		return rf.ReadFrom(r)
 	}
 	return io.Copy(w.ResponseWriter, r)
 }
 // recoverMiddleware wraps next so that a panic inside a handler is logged with
 // method, path, panic value and stack, and answered with a 500 "internal server
 // error" when the response has not been started yet.
 //
 // http.ErrAbortHandler is re-panicked instead of being handled: net/http uses
 // that sentinel to abort a response without logging, and recovering it here
 // would let a deliberately aborted response finish as if it were complete.
 func recoverMiddleware(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		tw := &trackingResponseWriter{ResponseWriter: w}
 		defer func() {
 			rec := recover()
 			if rec == nil {
 				return
 			}
 			if rec == http.ErrAbortHandler {
 				panic(rec)
 			}
 			slog.Error("http handler panic",
 				"method", r.Method,
 				"path", r.URL.Path,
 				"panic", fmt.Sprint(rec),
 				"stack", string(debug.Stack()),
 			)
 			// Once any part of the response is out, a second status line
 			// cannot be sent; leave the partial response as it is.
 			if !tw.wroteHeader {
 				http.Error(tw, "internal server error", http.StatusInternalServerError)
 			}
 		}()
 		next.ServeHTTP(tw, r)
 	})
 }
 // ── Infrastructure handlers ──────────────────────────────────────────────────
@@ -466,13 +568,44 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 		http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
 		return
 	}
-	datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
+	samples, err := h.metricsDB.LoadAll()
 	if err != nil || len(samples) == 0 {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}
 	timeline := metricsTimelineSegments(samples, time.Now())
 	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
 		buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
 		if err != nil {
 			http.Error(w, err.Error(), http.StatusInternalServerError)
 			return
 		}
 		if !ok {
 			http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 			return
 		}
 		w.Header().Set("Content-Type", "image/svg+xml")
 		w.Header().Set("Cache-Control", "no-store")
 		_, _ = w.Write(buf)
 		return
 	}
 	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}
-	buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
+	buf, err := renderMetricChartSVG(
 		title,
 		labels,
 		sampleTimes(samples),
 		datasets,
 		names,
 		yMin,
 		yMax,
 		chartCanvasHeightForPath(path, len(names)),
 		timeline,
 	)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -482,14 +615,6 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	_, _ = w.Write(buf)
 }
 // chartDataFromDB loads the full metrics history from h.metricsDB and passes it
 // to chartDataFromSamples for the given chart path. When loading fails or no
 // samples exist, the final bool is false and every other result is zero.
 func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
 	samples, err := h.metricsDB.LoadAll()
 	if err != nil || len(samples) == 0 {
 		return nil, nil, nil, "", nil, nil, false
 	}
 	return chartDataFromSamples(path, samples)
 }
 func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
 	var datasets [][]float64
 	var names []string
@@ -569,18 +694,24 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 		yMin = floatPtr(0)
 		yMax = autoMax120(datasets...)
 	case path == "gpu-all-clock":
 		title = "GPU Core Clock"
 		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
 		yMin, yMax = autoBounds120(datasets...)
 	case path == "gpu-all-memclock":
 		title = "GPU Memory Clock"
 		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
 		yMin, yMax = autoBounds120(datasets...)
 	case strings.HasPrefix(path, "gpu/"):
-		rest := strings.TrimPrefix(path, "gpu/")
+		idx, sub, ok := parseGPUChartPath(path)
-		sub := ""
+		if !ok {
-		if i := strings.LastIndex(rest, "-"); i > 0 {
+			return nil, nil, nil, "", nil, nil, false
 			sub = rest[i+1:]
 			rest = rest[:i]
 		}
 		idx := 0
 		fmt.Sscanf(rest, "%d", &idx)
 		switch sub {
 		case "load":
-			title = fmt.Sprintf("GPU %d Load", idx)
+			title = gpuDisplayLabel(idx) + " Load"
 			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
 			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
 			if util == nil && mem == nil {
@@ -591,7 +722,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			yMin = floatPtr(0)
 			yMax = floatPtr(100)
 		case "temp":
-			title = fmt.Sprintf("GPU %d Temperature", idx)
+			title = gpuDisplayLabel(idx) + " Temperature"
 			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
 			if temp == nil {
 				return nil, nil, nil, "", nil, nil, false
@@ -600,8 +731,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			names = []string{"Temp °C"}
 			yMin = floatPtr(0)
 			yMax = autoMax120(temp)
 		case "clock":
 			title = gpuDisplayLabel(idx) + " Core Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
 			if clock == nil {
 				return nil, nil, nil, "", nil, nil, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Core Clock MHz"}
 			yMin, yMax = autoBounds120(clock)
 		case "memclock":
 			title = gpuDisplayLabel(idx) + " Memory Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
 			if clock == nil {
 				return nil, nil, nil, "", nil, nil, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Memory Clock MHz"}
 			yMin, yMax = autoBounds120(clock)
 		default:
-			title = fmt.Sprintf("GPU %d Power", idx)
+			title = gpuDisplayLabel(idx) + " Power"
 			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
 			if power == nil {
 				return nil, nil, nil, "", nil, nil, false
@@ -618,6 +767,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
 }
 // parseGPUChartPath splits a chart path of the form "gpu/<index>[-<sub>]" into
 // the GPU index and the sub-chart name (empty when no "-<sub>" suffix is
 // present). ok is false, with zero values for idx and sub, when the prefix is
 // missing or the index is not a plain non-negative decimal number.
 //
 // The index must consist of ASCII digits only: fmt.Sscanf("%d") alone would
 // accept signs and stop at the first non-digit, so "gpu/-1" or "gpu/1abc-load"
 // would otherwise be treated as valid GPU paths.
 func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
 	if !strings.HasPrefix(path, "gpu/") {
 		return 0, "", false
 	}
 	rest := strings.TrimPrefix(path, "gpu/")
 	if rest == "" {
 		return 0, "", false
 	}
 	// The sub-chart name follows the last '-'; i > 0 keeps at least one
 	// character for the index.
 	if i := strings.LastIndex(rest, "-"); i > 0 {
 		sub = rest[i+1:]
 		rest = rest[:i]
 	}
 	for i := 0; i < len(rest); i++ {
 		if rest[i] < '0' || rest[i] > '9' {
 			return 0, "", false
 		}
 	}
 	// Sscanf still guards against values that overflow int.
 	n, err := fmt.Sscanf(rest, "%d", &idx)
 	if err != nil || n != 1 {
 		return 0, "", false
 	}
 	return idx, sub, true
 }
 func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
 	labels := make([]string, len(samples))
 	if len(samples) == 0 {
@@ -710,7 +879,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
 			continue
 		}
 		datasets = append(datasets, ds)
-		names = append(names, fmt.Sprintf("GPU %d", idx))
+		names = append(names, gpuDisplayLabel(idx))
 	}
 	return datasets, names
 }
@@ -843,64 +1012,37 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
 	return floatPtr(low), floatPtr(high)
 }
-// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
+func gpuChartLabelIndices(total, target int) []int {
-func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
+	if total <= 0 {
-	n := len(labels)
+		return nil
 	if n == 0 {
 		n = 1
 		labels = []string{""}
 	}
-	for i := range datasets {
+	if total == 1 {
-		if len(datasets[i]) == 0 {
+		return []int{0}
 			datasets[i] = make([]float64, n)
 		}
 	}
-	// Append global min/avg/max to title.
+	step := total / target
-	mn, avg, mx := globalStats(datasets)
+	if step < 1 {
-	if mx > 0 {
+		step = 1
 		title = fmt.Sprintf("%s    ↓%s  ~%s  ↑%s",
 			title,
 			chartLegendNumber(mn),
 			chartLegendNumber(avg),
 			chartLegendNumber(mx),
 		)
 	}
-	title = sanitizeChartText(title)
+	var indices []int
-	names = sanitizeChartTexts(names)
+	for i := 0; i < total; i += step {
-	sparse := sanitizeChartTexts(sparseLabels(labels, 6))
+		indices = append(indices, i)
 	}
 	if indices[len(indices)-1] != total-1 {
 		indices = append(indices, total-1)
 	}
 	return indices
 }
-	opt := gocharts.NewLineChartOptionWithData(datasets)
+func chartCanvasHeightForPath(path string, seriesCount int) int {
-	opt.Title = gocharts.TitleOption{Text: title}
+	height := chartCanvasHeight(seriesCount)
-	opt.XAxis.Labels = sparse
+	if isGPUChartPath(path) {
-	opt.Legend = gocharts.LegendOption{SeriesNames: names}
+		return height * 2
 	if chartLegendVisible(len(names)) {
 		opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
 		opt.Legend.OverlayChart = gocharts.Ptr(false)
 	} else {
 		opt.Legend.Show = gocharts.Ptr(false)
 	}
 	opt.Symbol = gocharts.SymbolNone
 	// Right padding: reserve space for the MarkLine label (library recommendation).
 	opt.Padding = gocharts.NewBox(20, 20, 80, 20)
 	if yMin != nil || yMax != nil {
 		opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
 	}
 	return height
 }
-	// Add a single peak mark line on the series that holds the global maximum.
+func isGPUChartPath(path string) bool {
-	peakIdx, _ := globalPeakSeries(datasets)
+	return strings.HasPrefix(path, "gpu-all-") || strings.HasPrefix(path, "gpu/")
 	if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
 		opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
 	}
 	p := gocharts.NewPainter(gocharts.PainterOptions{
 		OutputFormat: gocharts.ChartOutputSVG,
 		Width:        1400,
 		Height:       chartCanvasHeight(len(names)),
 	}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
 	if err := p.LineChart(opt); err != nil {
 		return nil, err
 	}
 	return p.Bytes()
 }
 func chartLegendVisible(seriesCount int) bool {
@@ -914,30 +1056,6 @@ func chartCanvasHeight(seriesCount int) int {
 	return 288
 }
 // chartYAxisOption builds the Y-axis configuration used for charts with fixed
 // bounds: yMin/yMax are passed through as given (nil is passed through too),
 // the label count is fixed at 11 and values are formatted by chartYAxisNumber.
 func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
 	return gocharts.YAxisOption{
 		Min:            yMin,
 		Max:            yMax,
 		LabelCount:     11,
 		ValueFormatter: chartYAxisNumber,
 	}
 }
 // globalPeakSeries scans every value of every dataset and returns the index of
 // the series holding the largest value together with that value. Only values
 // strictly greater than zero count, so empty or all-non-positive input yields
 // (-1, 0); on ties the earliest series wins.
 func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
 	best, top := -1, 0.0
 	for series := range datasets {
 		for _, value := range datasets[series] {
 			if value > top {
 				best, top = series, value
 			}
 		}
 	}
 	return best, top
 }
 // globalStats returns min, average, and max across all values in all datasets.
 func globalStats(datasets [][]float64) (mn, avg, mx float64) {
 	var sum float64
@@ -977,21 +1095,6 @@ func sanitizeChartText(s string) string {
 	}, s))
 }
 // sanitizeChartTexts returns a new slice of the same length as in, holding
 // sanitizeChartText applied to each element in order. The input is not modified.
 func sanitizeChartTexts(in []string) []string {
 	cleaned := make([]string, 0, len(in))
 	for idx := range in {
 		cleaned = append(cleaned, sanitizeChartText(in[idx]))
 	}
 	return cleaned
 }
 // safeIdx returns s[i], or 0 when i lies outside the slice. Negative indices
 // are treated as out of range as well, so the function never panics.
 func safeIdx(s []float64, i int) float64 {
 	if i < 0 || i >= len(s) {
 		return 0
 	}
 	return s[i]
 }
 func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
 	var datasets [][]float64
 	var names []string
@@ -1078,20 +1181,6 @@ func chartYAxisNumber(v float64) string {
 	return out
 }
 // sparseLabels returns a copy of labels in which only every step-th entry
 // (starting with the first) is kept and the rest are blanked, where
 // step = len(labels)/n with a minimum of 1. The result has the same length as
 // the input. A non-positive n is treated as 1 rather than dividing by zero.
 func sparseLabels(labels []string, n int) []string {
 	if n < 1 {
 		n = 1
 	}
 	out := make([]string, len(labels))
 	step := len(labels) / n
 	if step < 1 {
 		step = 1
 	}
 	for i, l := range labels {
 		if i%step == 0 {
 			out[i] = l
 		}
 	}
 	return out
 }
 func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
 	if h.metricsDB == nil {
 		http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
@@ -1107,6 +1196,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
 func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Cache-Control", "no-store")
 	if strings.TrimSpace(h.opts.AuditPath) == "" {
 		w.WriteHeader(http.StatusOK)
 		_, _ = w.Write([]byte("ready"))
 		return
 	}
 	if _, err := os.Stat(h.opts.AuditPath); err != nil {
 		w.WriteHeader(http.StatusServiceUnavailable)
 		_, _ = w.Write([]byte("starting"))
@@ -1120,37 +1214,106 @@ const loadingPageHTML = `<!DOCTYPE html>
 <html lang="en">
 <head>
 <meta charset="UTF-8">
-<title>EASY-BEE</title>
+<title>EASY-BEE — Starting</title>
 <style>
 *{margin:0;padding:0;box-sizing:border-box}
 html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
-.logo{font-size:13px;line-height:1.4;color:#f6c90e;margin-bottom:48px;white-space:pre}
+.wrap{text-align:center;width:420px}
-.spinner{width:48px;height:48px;border:4px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 24px}
+.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
 .subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
 .spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
 .spinner.hidden{display:none}
@keyframes spin{to{transform:rotate(360deg)}}
-.status{font-size:14px;color:#a0aec0;letter-spacing:.05em}
+.status{font-size:13px;color:#a0aec0;margin-bottom:20px;min-height:18px}
 table{width:100%;border-collapse:collapse;font-size:12px;margin-bottom:20px;display:none}
 td{padding:3px 6px;text-align:left}
 td:first-child{color:#718096;width:55%}
 .ok{color:#68d391}
 .run{color:#f6c90e}
 .fail{color:#fc8181}
 .dim{color:#4a5568}
 .btn{background:#1a202c;color:#a0aec0;border:1px solid #2d3748;padding:7px 18px;font-size:12px;cursor:pointer;font-family:inherit;display:none}
 .btn:hover{border-color:#718096;color:#e2e8f0}
 </style>
 </head>
 <body>
-<div style="text-align:center">
+<div class="wrap">
  <div class="logo">  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗
  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝
  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗
  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝
  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗
  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝</div>
-  <div class="spinner"></div>
+  <div class="subtitle">Hardware Audit LiveCD</div>
-  <div class="status" id="s">Starting up...</div>
+  <div class="spinner" id="spin"></div>
  <div class="status" id="st">Connecting to bee-web...</div>
  <table id="tbl"></table>
  <button class="btn" id="btn" onclick="go()">Open app now</button>
 </div>
 <script>
-function probe(){
+(function(){
-  fetch('/api/ready',{cache:'no-store'})
+var gone = false;
-    .then(function(r){
+function go(){ if(!gone){gone=true;window.location.replace('/');} }
-      if(r.ok){window.location.replace('/');}
+
-      else{setTimeout(probe,1000);}
+function icon(s){
  if(s==='active')   return '<span class="ok">&#9679; active</span>';
  if(s==='failed')   return '<span class="fail">&#10005; failed</span>';
  if(s==='activating'||s==='reloading') return '<span class="run">&#9675; starting</span>';
  if(s==='inactive') return '<span class="dim">&#9675; inactive</span>';
  return '<span class="dim">'+s+'</span>';
 }
 function allSettled(svcs){
  for(var i=0;i<svcs.length;i++){
    var s=svcs[i].state;
    if(s!=='active'&&s!=='failed'&&s!=='inactive') return false;
  }
  return true;
 }
 var pollTimer=null;
 function pollServices(){
  fetch('/api/services',{cache:'no-store'})
    .then(function(r){return r.json();})
    .then(function(svcs){
      if(!svcs||!svcs.length) return;
      var tbl=document.getElementById('tbl');
      tbl.style.display='';
      var html='';
      for(var i=0;i<svcs.length;i++)
        html+='<tr><td>'+svcs[i].name+'</td><td>'+icon(svcs[i].state)+'</td></tr>';
      tbl.innerHTML=html;
      if(allSettled(svcs)){
        clearInterval(pollTimer);
        document.getElementById('spin').className='spinner hidden';
        document.getElementById('st').textContent='Ready \u2014 opening...';
        setTimeout(go,800);
      }
    })
-    .catch(function(){setTimeout(probe,1000);});
+    .catch(function(){});
 }
 function probe(){
  fetch('/healthz',{cache:'no-store'})
    .then(function(r){
      if(r.ok){
        document.getElementById('st').textContent='bee-web running \u2014 checking services...';
        document.getElementById('btn').style.display='';
        pollServices();
        pollTimer=setInterval(pollServices,1500);
      } else {
        document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
        setTimeout(probe,500);
      }
    })
    .catch(function(){
      document.getElementById('st').textContent='Waiting for bee-web to start...';
      setTimeout(probe,500);
    });
 }
 probe();
 })();
 </script>
 </body>
 </html>`
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -1,6 +1,7 @@
 package webui
 import (
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -34,6 +35,49 @@ func TestChartLegendNumber(t *testing.T) {
 	}
 }
 // TestRecoverMiddlewareReturns500OnPanic wraps a handler that always panics in
 // recoverMiddleware and expects a 500 response carrying a generic error body.
 func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
 	panicking := http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
 		panic("boom")
 	})
 	rec := httptest.NewRecorder()
 	recoverMiddleware(panicking).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/panic", nil))
 	if got, want := rec.Code, http.StatusInternalServerError; got != want {
 		t.Fatalf("status=%d want %d", got, want)
 	}
 	if body := rec.Body.String(); !strings.Contains(body, "internal server error") {
 		t.Fatalf("body=%q", body)
 	}
 }
 func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
 	handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		if !sseStart(w) {
 			return
 		}
 		if !sseWrite(w, "tick", "ok") {
 			t.Fatal("expected sse write to succeed")
 		}
 	}))
 	rec := httptest.NewRecorder()
 	req := httptest.NewRequest(http.MethodGet, "/stream", nil)
 	handler.ServeHTTP(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
 		t.Fatalf("content-type=%q", got)
 	}
 	body := rec.Body.String()
 	if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
 		t.Fatalf("body=%q", body)
 	}
 }
 func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 	samples := []platform.LiveMetricSample{
 		{
@@ -136,6 +180,39 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
 	}
 }
 // TestChartDataFromSamplesIncludesGPUClockCharts feeds two samples for GPUs 0
 // and 3 into chartDataFromSamples("gpu-all-clock") and checks the chart title,
 // the series names, and the latest clock value of the second series.
 func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
 	older := platform.LiveMetricSample{
 		Timestamp: time.Now().Add(-2 * time.Minute),
 		GPUs: []platform.GPUMetricRow{
 			{GPUIndex: 0, ClockMHz: 1400},
 			{GPUIndex: 3, ClockMHz: 1500},
 		},
 	}
 	newer := platform.LiveMetricSample{
 		Timestamp: time.Now().Add(-1 * time.Minute),
 		GPUs: []platform.GPUMetricRow{
 			{GPUIndex: 0, ClockMHz: 1410},
 			{GPUIndex: 3, ClockMHz: 1510},
 		},
 	}
 	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", []platform.LiveMetricSample{older, newer})
 	switch {
 	case !ok:
 		t.Fatal("gpu-all-clock returned ok=false")
 	case title != "GPU Core Clock":
 		t.Fatalf("title=%q", title)
 	case len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3":
 		t.Fatalf("names=%v", names)
 	}
 	// Series 1 is GPU 3; index 1 is the newer sample.
 	if got := datasets[1][1]; got != 1510 {
 		t.Fatalf("GPU 3 core clock=%v want 1510", got)
 	}
 }
 func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
 	want := []float64{0, 480, 480, 480, 510, 510}
@@ -157,6 +234,21 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
 	if !strings.Contains(body, "el.dataset.loading === '1'") {
 		t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
 	}
 	if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
 		t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
 	}
 	if !strings.Contains(body, `id="gpu-chart-toggle"`) {
 		t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
 	}
 	if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
 		t.Fatalf("metrics page should include GPU core clock chart: %s", body)
 	}
 	if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
 		t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
 	}
 	if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
 		t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
 	}
 }
 func TestChartLegendVisible(t *testing.T) {
@@ -199,6 +291,124 @@ func TestChartCanvasHeight(t *testing.T) {
 	}
 }
 // TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps checks that two
 // overlapping task windows collapse into one active span and that the idle
 // gaps before, between, and after the tasks are reported as inactive segments.
 func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
 	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
 	end := start.Add(10 * time.Minute)
 	doneTask := func(from, to time.Duration) Task {
 		startedAt, doneAt := start.Add(from), start.Add(to)
 		return Task{Name: "task", Status: TaskDone, StartedAt: &startedAt, DoneAt: &doneAt}
 	}
 	tasks := []Task{
 		doneTask(1*time.Minute, 3*time.Minute),
 		doneTask(2*time.Minute, 5*time.Minute),
 		doneTask(7*time.Minute, 8*time.Minute),
 	}
 	segments := chartTimelineSegmentsForRange(start, end, end, tasks)
 	if len(segments) != 5 {
 		t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
 	}
 	// Expected segments as minute offsets from start.
 	want := []struct {
 		active   bool
 		from, to int
 	}{
 		{false, 0, 1},
 		{true, 1, 5},
 		{false, 5, 7},
 		{true, 7, 8},
 		{false, 8, 10},
 	}
 	minutesIn := func(ts time.Time) int { return int(ts.Sub(start).Minutes()) }
 	for i, segment := range segments {
 		exp := want[i]
 		if segment.Active != exp.active {
 			t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, exp.active)
 		}
 		if got := minutesIn(segment.Start); got != exp.from {
 			t.Fatalf("segment[%d] start=%d want %d", i, got, exp.from)
 		}
 		if got := minutesIn(segment.End); got != exp.to {
 			t.Fatalf("segment[%d] end=%d want %d", i, got, exp.to)
 		}
 	}
 }
 func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
 	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
 	labels := []string{"12:00", "12:01", "12:02"}
 	times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
 	svg, err := renderMetricChartSVG(
 		"System Power",
 		labels,
 		times,
 		[][]float64{{300, 320, 310}},
 		[]string{"Power W"},
 		floatPtr(0),
 		floatPtr(400),
 		360,
 		[]chartTimelineSegment{
 			{Start: start, End: start.Add(time.Minute), Active: false},
 			{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
 		},
 	)
 	if err != nil {
 		t.Fatal(err)
 	}
 	body := string(svg)
 	if !strings.Contains(body, `data-role="timeline-overlay"`) {
 		t.Fatalf("svg missing timeline overlay: %s", body)
 	}
 	if !strings.Contains(body, `opacity="0.10"`) {
 		t.Fatalf("svg missing idle overlay opacity: %s", body)
 	}
 	if !strings.Contains(body, `System Power`) {
 		t.Fatalf("svg missing chart title: %s", body)
 	}
 }
 // TestHandleMetricsChartSVGRendersCustomSVG stores three power samples in a
 // temporary metrics DB, installs one finished task in globalQueue, and checks
 // that the server-power chart endpoint returns the custom SVG with a timeline
 // overlay and rounded polyline styling.
 func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
 	dir := t.TempDir()
 	db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
 	if err != nil {
 		t.Fatal(err)
 	}
 	t.Cleanup(func() { _ = db.db.Close() })
 	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
 	samples := []platform.LiveMetricSample{
 		{Timestamp: start, PowerW: 300},
 		{Timestamp: start.Add(time.Minute), PowerW: 320},
 		{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
 	}
 	for i := range samples {
 		if err := db.Write(samples[i]); err != nil {
 			t.Fatalf("write sample %d: %v", i, err)
 		}
 	}
 	// Swap the global task list for a single finished task and restore it afterwards.
 	startedAt := start.Add(30 * time.Second)
 	doneAt := start.Add(90 * time.Second)
 	globalQueue.mu.Lock()
 	savedTasks := globalQueue.tasks
 	globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &startedAt, DoneAt: &doneAt}}
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = savedTasks
 		globalQueue.mu.Unlock()
 	})
 	h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
 	rec := httptest.NewRecorder()
 	h.handleMetricsChartSVG(rec, httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	body := rec.Body.String()
 	checks := []struct{ needle, failure string }{
 		{`data-role="timeline-overlay"`, "custom svg response missing timeline overlay"},
 		{`stroke-linecap="round"`, "custom svg response missing custom polyline styling"},
 	}
 	for _, c := range checks {
 		if !strings.Contains(body, c.needle) {
 			t.Fatalf("%s: %s", c.failure, body)
 		}
 	}
 }
 func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
 	want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -212,21 +422,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	}
 }
 // TestChartYAxisOption checks that chartYAxisOption keeps the supplied min/max
 // pointers, uses eleven labels, and formats 1000 with the short "к" suffix.
 func TestChartYAxisOption(t *testing.T) {
 	lo, hi := floatPtr(0), floatPtr(100)
 	opt := chartYAxisOption(lo, hi)
 	if !(opt.Min == lo && opt.Max == hi) {
 		t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
 	}
 	if count := opt.LabelCount; count != 11 {
 		t.Fatalf("chartYAxisOption labelCount=%d want 11", count)
 	}
 	formatted := opt.ValueFormatter(1000)
 	if formatted != "1к" {
 		t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", formatted)
 	}
 }
 func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
 	r1 := newMetricsRing(4)
 	r2 := newMetricsRing(4)
@@ -275,9 +470,10 @@ func TestRootRendersDashboard(t *testing.T) {
 	}
 	handler := NewHandler(HandlerOptions{
-		Title:     "Bee Hardware Audit",
+		Title:      "Bee Hardware Audit",
-		AuditPath: path,
+		BuildLabel: "1.2.3",
-		ExportDir: exportDir,
+		AuditPath:  path,
 		ExportDir:  exportDir,
 	})
 	first := httptest.NewRecorder()
@@ -292,6 +488,11 @@ func TestRootRendersDashboard(t *testing.T) {
 	if !strings.Contains(first.Body.String(), `/viewer`) {
 		t.Fatalf("first body missing viewer link: %s", first.Body.String())
 	}
 	versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
 	navIdx := strings.Index(first.Body.String(), `href="/"`)
 	if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
 		t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
 	}
 	if got := first.Header().Get("Cache-Control"); got != "no-store" {
 		t.Fatalf("first cache-control=%q", got)
 	}
@@ -329,7 +530,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
-	if !strings.Contains(body, `Run Audit`) {
+	if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
 		t.Fatalf("dashboard missing run audit button: %s", body)
 	}
 	if strings.Contains(body, `No audit data`) {
@@ -337,6 +538,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
 	}
 }
 // TestReadyIsOKWhenAuditPathIsUnset checks that /api/ready answers 200 "ready"
 // for a handler built from empty options.
 func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
 	rec := httptest.NewRecorder()
 	req := httptest.NewRequest(http.MethodGet, "/api/ready", nil)
 	NewHandler(HandlerOptions{}).ServeHTTP(rec, req)
 	body := rec.Body.String()
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d body=%s", rec.Code, body)
 	}
 	if strings.TrimSpace(body) != "ready" {
 		t.Fatalf("body=%q want ready", body)
 	}
 }
 func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
@@ -359,7 +572,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
 	}
 }
-func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
+func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
@@ -367,8 +580,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
-	if !strings.Contains(body, `id="task-log-overlay"`) {
+	if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
-		t.Fatalf("tasks page missing log modal overlay: %s", body)
+		t.Fatalf("tasks page missing task report hint: %s", body)
 	}
 	if !strings.Contains(body, `_taskPageSize = 50`) {
 		t.Fatalf("tasks page missing pagination size config: %s", body)
@@ -389,12 +602,295 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
 	if !strings.Contains(body, `Restart GPU Drivers`) {
 		t.Fatalf("tools page missing restart gpu drivers button: %s", body)
 	}
-	if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
+	if !strings.Contains(body, `restartGPUDrivers()`) {
-		t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
+		t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
 	}
 	if !strings.Contains(body, `id="boot-source-text"`) {
 		t.Fatalf("tools page missing boot source field: %s", body)
 	}
 	if !strings.Contains(body, `Export to USB`) {
 		t.Fatalf("tools page missing export to usb section: %s", body)
 	}
 	if !strings.Contains(body, `Support Bundle</button>`) {
 		t.Fatalf("tools page missing support bundle usb button: %s", body)
 	}
 }
 // TestBenchmarkPageRendersGPUSelectionControls checks that /benchmark renders
 // the nav link, the GPU list container, and the endpoints and controls the
 // page script relies on.
 func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 	rec := httptest.NewRecorder()
 	NewHandler(HandlerOptions{}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	required := []string{
 		`href="/benchmark"`,
 		`id="benchmark-gpu-list"`,
 		`/api/gpu/nvidia`,
 		`/api/benchmark/nvidia/run`,
 		`benchmark-run-nccl`,
 	}
 	for _, fragment := range required {
 		if strings.Contains(body, fragment) {
 			continue
 		}
 		t.Fatalf("benchmark page missing %q: %s", fragment, body)
 	}
 }
 // TestBenchmarkPageRendersSavedResultsTable writes one saved benchmark
 // result.json with two GPUs into the export directory and checks that
 // /benchmark lists the run with its timestamp and both composite scores.
 func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
 	exportDir := filepath.Join(t.TempDir(), "export")
 	runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
 	if err := os.MkdirAll(runDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	gpus := []platform.BenchmarkGPUResult{
 		{
 			Index:  0,
 			Name:   "NVIDIA H100 PCIe",
 			Scores: platform.BenchmarkScorecard{CompositeScore: 1176.25},
 		},
 		{
 			Index:  1,
 			Name:   "NVIDIA H100 PCIe",
 			Scores: platform.BenchmarkScorecard{CompositeScore: 1168.50},
 		},
 	}
 	result := platform.NvidiaBenchmarkResult{
 		GeneratedAt:      time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
 		BenchmarkProfile: "standard",
 		OverallStatus:    "OK",
 		GPUs:             gpus,
 	}
 	encoded, err := json.Marshal(result)
 	if err != nil {
 		t.Fatal(err)
 	}
 	if err := os.WriteFile(filepath.Join(runDir, "result.json"), encoded, 0644); err != nil {
 		t.Fatal(err)
 	}
 	rec := httptest.NewRecorder()
 	NewHandler(HandlerOptions{ExportDir: exportDir}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	// The expected timestamp is the run's GeneratedAt formatted in local time.
 	required := []string{
 		`Benchmark Results`,
 		`Composite score by saved benchmark run and GPU.`,
 		`NVIDIA H100 PCIe / GPU 0`,
 		`NVIDIA H100 PCIe / GPU 1`,
 		`#1`,
 		result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
 		`1176.25`,
 		`1168.50`,
 	}
 	for _, fragment := range required {
 		if strings.Contains(body, fragment) {
 			continue
 		}
 		t.Fatalf("benchmark page missing %q: %s", fragment, body)
 	}
 }
 // TestValidatePageRendersNvidiaTargetedStressCard checks that /validate shows
 // the NVIDIA targeted stress card together with the shared GPU selection UI.
 func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	rec := httptest.NewRecorder()
 	NewHandler(HandlerOptions{}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	required := []string{
 		`NVIDIA GPU Targeted Stress`,
 		`nvidia-targeted-stress`,
 		`controlled NVIDIA DCGM load`,
 		`<code>dcgmi diag targeted_stress</code>`,
 		`NVIDIA GPU Selection`,
 		`All NVIDIA validate tasks use only the GPUs selected here.`,
 		`Select All`,
 		`id="sat-gpu-list"`,
 	}
 	for _, fragment := range required {
 		if strings.Contains(body, fragment) {
 			continue
 		}
 		t.Fatalf("validate page missing %q: %s", fragment, body)
 	}
 }
 // TestBurnPageRendersGoalBasedNVIDIACards checks that /burn renders the NVIDIA
 // max-compute and interconnect cards plus the GPU list container.
 func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	rec := httptest.NewRecorder()
 	NewHandler(HandlerOptions{}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	required := []string{
 		`NVIDIA Max Compute Load`,
 		`dcgmproftester`,
 		`targeted_stress remain in <a href="/validate">Validate</a>`,
 		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
 		`id="burn-gpu-list"`,
 	}
 	for _, fragment := range required {
 		if strings.Contains(body, fragment) {
 			continue
 		}
 		t.Fatalf("burn page missing %q: %s", fragment, body)
 	}
 }
 func TestTaskDetailPageRendersSavedReport(t *testing.T) {
 	dir := t.TempDir()
 	exportDir := filepath.Join(dir, "export")
 	reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
 	if err := os.MkdirAll(reportDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	reportPath := filepath.Join(reportDir, "report.html")
 	if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
 		t.Fatal(err)
 	}
 	globalQueue.mu.Lock()
 	origTasks := globalQueue.tasks
 	globalQueue.tasks = []*Task{{
 		ID:             "task-1",
 		Name:           "CPU SAT",
 		Target:         "cpu",
 		Status:         TaskDone,
 		CreatedAt:      time.Now(),
 		ArtifactsDir:   reportDir,
 		ReportHTMLPath: reportPath,
 	}}
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = origTasks
 		globalQueue.mu.Unlock()
 	})
 	handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	if !strings.Contains(body, `saved report`) {
 		t.Fatalf("task detail page missing saved report: %s", body)
 	}
 	if !strings.Contains(body, `Back to Tasks`) {
 		t.Fatalf("task detail page missing back link: %s", body)
 	}
 }
 func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
 	globalQueue.mu.Lock()
 	origTasks := globalQueue.tasks
 	globalQueue.tasks = []*Task{{
 		ID:        "task-live-1",
 		Name:      "CPU SAT",
 		Target:    "cpu",
 		Status:    TaskRunning,
 		CreatedAt: time.Now(),
 	}}
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = origTasks
 		globalQueue.mu.Unlock()
 	})
 	handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	if !strings.Contains(body, `Cancel</button>`) {
 		t.Fatalf("task detail page missing cancel button: %s", body)
 	}
 	if !strings.Contains(body, `function cancelTaskDetail(id)`) {
 		t.Fatalf("task detail page missing cancel handler: %s", body)
 	}
 	if !strings.Contains(body, `/api/tasks/' + id + '/cancel`) {
 		t.Fatalf("task detail page missing cancel endpoint: %s", body)
 	}
 	if !strings.Contains(body, `id="task-live-charts"`) {
 		t.Fatalf("task detail page missing live charts container: %s", body)
 	}
 	if !strings.Contains(body, `/api/tasks/' + taskId + '/charts`) {
 		t.Fatalf("task detail page missing live charts index endpoint: %s", body)
 	}
 }
 // TestTaskChartSVGUsesTaskTimeWindow writes three power samples, installs a
 // finished task whose window covers only the last two, and checks that the
 // task chart statistics exclude the sample recorded before the task started.
 func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
 	metricsPath := filepath.Join(t.TempDir(), "metrics.db")
 	// Point the task report code at the temporary DB for the duration of the test.
 	savedPath := taskReportMetricsDBPath
 	taskReportMetricsDBPath = metricsPath
 	t.Cleanup(func() { taskReportMetricsDBPath = savedPath })
 	db, err := openMetricsDB(metricsPath)
 	if err != nil {
 		t.Fatalf("openMetricsDB: %v", err)
 	}
 	base := time.Now().UTC()
 	for _, sample := range []platform.LiveMetricSample{
 		{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
 		{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
 		{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
 	} {
 		if err := db.Write(sample); err != nil {
 			t.Fatalf("Write: %v", err)
 		}
 	}
 	_ = db.Close()
 	// The task window brackets the -2m and -1m samples with 5s of slack.
 	started := base.Add(-2*time.Minute - 5*time.Second)
 	done := base.Add(-1*time.Minute + 5*time.Second)
 	finished := &Task{
 		ID:        "task-chart-1",
 		Name:      "Power Window",
 		Target:    "cpu",
 		Status:    TaskDone,
 		CreatedAt: started.Add(-10 * time.Second),
 		StartedAt: &started,
 		DoneAt:    &done,
 	}
 	globalQueue.mu.Lock()
 	savedTasks := globalQueue.tasks
 	globalQueue.tasks = []*Task{finished}
 	globalQueue.mu.Unlock()
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = savedTasks
 		globalQueue.mu.Unlock()
 	})
 	req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
 	req.SetPathValue("id", "task-chart-1")
 	rec := httptest.NewRecorder()
 	NewHandler(HandlerOptions{Title: "Bee Hardware Audit"}).ServeHTTP(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	body := rec.Body.String()
 	switch {
 	case !strings.Contains(body, "System Power"):
 		t.Fatalf("task chart missing expected title: %s", body)
 	case !strings.Contains(body, "min 200"):
 		t.Fatalf("task chart stats should start from in-window sample: %s", body)
 	case strings.Contains(body, "min 100"):
 		t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
 	}
 }
 func TestViewerRendersLatestSnapshot(t *testing.T) {
@@ -518,3 +1014,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
 		t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
 	}
 }
 // TestDashboardRendersRuntimeHealthTable writes an audit snapshot, a
 // runtime-health.json, and a component-status.json, then checks that the
 // dashboard renders the runtime health table with the expected rows, issues,
 // and sources.
 func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
 	exportDir := filepath.Join(dir, "export")
 	if err := os.MkdirAll(exportDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	mustWrite := func(name, content string) {
 		if err := os.WriteFile(name, []byte(content), 0644); err != nil {
 			t.Fatal(err)
 		}
 	}
 	mustWrite(path, `{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`)
 	mustWrite(filepath.Join(exportDir, "runtime-health.json"), `{
  "status":"PARTIAL",
  "checked_at":"2026-03-16T10:00:00Z",
  "export_dir":"/tmp/export",
  "driver_ready":true,
  "cuda_ready":false,
  "network_status":"PARTIAL",
  "issues":[
    {"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
    {"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
  ],
  "tools":[
    {"name":"dmidecode","ok":true},
    {"name":"nvidia-smi","ok":false}
  ],
  "services":[
    {"name":"bee-web","status":"active"},
    {"name":"bee-nvidia","status":"inactive"}
  ]
 }`)
 	mustWrite(filepath.Join(exportDir, "component-status.json"), `[
  {
    "component_key":"cpu:all",
    "status":"Warning",
    "error_summary":"cpu SAT: FAILED",
    "history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
  },
  {
    "component_key":"memory:all",
    "status":"OK",
    "history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
  },
  {
    "component_key":"storage:nvme0n1",
    "status":"Critical",
    "error_summary":"storage SAT: FAILED",
    "history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
  },
  {
    "component_key":"pcie:gpu:nvidia",
    "status":"Warning",
    "error_summary":"nvidia SAT: FAILED",
    "history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
  }
 ]`)
 	rec := httptest.NewRecorder()
 	NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir}).
 		ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
 	}
 	body := rec.Body.String()
 	required := []string{
 		`Runtime Health`,
 		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
 		`Export Directory`,
 		`Network`,
 		`NVIDIA/AMD Driver`,
 		`CUDA / ROCm`,
 		`Required Utilities`,
 		`Bee Services`,
 		`<td>CPU</td>`,
 		`<td>Memory</td>`,
 		`<td>Storage</td>`,
 		`<td>GPU</td>`,
 		`CUDA runtime is not ready for GPU SAT.`,
 		`Missing: nvidia-smi`,
 		`bee-nvidia=inactive`,
 		`cpu SAT: FAILED`,
 		`storage SAT: FAILED`,
 		`sat:nvidia`,
 	}
 	for _, fragment := range required {
 		if strings.Contains(body, fragment) {
 			continue
 		}
 		t.Fatalf("dashboard missing %q: %s", fragment, body)
 	}
 }
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -0,0 +1,42 @@
 package webui
 import (
 	"fmt"
 	"log/slog"
 	"runtime/debug"
 	"time"
 )
 // goRecoverLoop starts a goroutine that runs fn under runRecoverable and runs
 // it again each time it panics, sleeping restartDelay between attempts when
 // the delay is positive. The goroutine exits once fn returns without panicking.
 func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
 	supervise := func() {
 		// runRecoverable reports true only when fn panicked, so the loop
 		// continues exactly as long as fn keeps panicking.
 		for runRecoverable(name, fn) {
 			if restartDelay > 0 {
 				time.Sleep(restartDelay)
 			}
 		}
 	}
 	go supervise()
 }
 // goRecoverOnce runs fn once in its own goroutine under runRecoverable, so a
 // panic in fn is logged instead of crashing the process. The panic indicator
 // is discarded.
 func goRecoverOnce(name string, fn func()) {
 	go runRecoverable(name, fn)
 }
 // runRecoverable calls fn and reports whether it panicked. A recovered panic
 // is logged through slog with the component name, the panic value, and the
 // stack captured inside the deferred handler.
 func runRecoverable(name string, fn func()) (panicked bool) {
 	defer func() {
 		rec := recover()
 		if rec == nil {
 			return
 		}
 		// The named result lets the deferred handler override the "false"
 		// returned on the normal path.
 		panicked = true
 		stack := string(debug.Stack())
 		slog.Error("recovered panic",
 			"component", name,
 			"panic", fmt.Sprint(rec),
 			"stack", stack,
 		)
 	}()
 	fn()
 	return false
 }
--- a/audit/internal/webui/task_page.go
+++ b/audit/internal/webui/task_page.go
@@ -0,0 +1,267 @@
 package webui
 import (
 	"encoding/json"
 	"fmt"
 	"html"
 	"net/http"
 	"net/url"
 	"os"
 	"strings"
 	"time"
 	"bee/audit/internal/platform"
 )
 // handleTaskPage serves the HTML detail page for the task named by the "id"
 // path value, or 404 when globalQueue has no such task. The page is rendered
 // from a copy of the task and is sent with caching disabled.
 func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
 	task, found := globalQueue.findByID(r.PathValue("id"))
 	if !found {
 		http.NotFound(w, r)
 		return
 	}
 	// Passing *task by value hands the renderer its own copy.
 	page := renderTaskDetailPage(h.opts, *task)
 	headers := w.Header()
 	headers.Set("Cache-Control", "no-store")
 	headers.Set("Content-Type", "text/html; charset=utf-8")
 	_, _ = w.Write([]byte(page))
 }
 // handleAPITaskChartsIndex returns a JSON array of {title, file} entries for
 // the charts that render successfully from the task's metric samples. It
 // responds 404 when the task is unknown and always emits an array, never null.
 func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
 	task, samples, _, _, found := h.taskSamplesForRequest(r)
 	if !found {
 		http.NotFound(w, r)
 		return
 	}
 	type chartEntry struct {
 		Title string `json:"title"`
 		File  string `json:"file"`
 	}
 	// A non-nil empty slice keeps the JSON output "[]" when nothing renders.
 	charts := []chartEntry{}
 	for _, spec := range taskChartSpecsForSamples(samples) {
 		// Each candidate is rendered only to learn its title and whether it has data.
 		if title, _, rendered := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task)); rendered {
 			charts = append(charts, chartEntry{Title: title, File: spec.File})
 		}
 	}
 	headers := w.Header()
 	headers.Set("Cache-Control", "no-store")
 	headers.Set("Content-Type", "application/json; charset=utf-8")
 	_ = json.NewEncoder(w).Encode(charts)
 }
 // handleAPITaskChartSVG serves one task chart as SVG. The chart file name is
 // whatever follows "/api/tasks/<id>/chart/" in the request path. It responds
 // 404 for an unknown task or chart file, and 503 when the chart cannot be
 // rendered from the available samples.
 func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
 	task, samples, _, _, found := h.taskSamplesForRequest(r)
 	if !found {
 		http.NotFound(w, r)
 		return
 	}
 	prefix := "/api/tasks/" + task.ID + "/chart/"
 	chartPath, known := taskChartPathFromFile(strings.TrimPrefix(r.URL.Path, prefix))
 	if !known {
 		http.NotFound(w, r)
 		return
 	}
 	title, svg, hasData := renderTaskChartSVG(chartPath, samples, taskTimelineForTask(task))
 	if unusable := !hasData || len(svg) == 0 || strings.TrimSpace(title) == ""; unusable {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}
 	headers := w.Header()
 	headers.Set("Content-Type", "image/svg+xml")
 	headers.Set("Cache-Control", "no-store")
 	_, _ = w.Write(svg)
 }
 // renderTaskDetailPage builds the complete HTML document for a single task.
 //
 // The page always contains a toolbar with a back link, plus a Cancel button
 // for running or pending tasks. Below it comes either the saved report
 // fragment read from task.ReportHTMLPath or, when none is available, a
 // minimal summary card showing the title, status badge, and error message.
 // Running tasks also get a "Live Charts" card. Running and pending tasks get a
 // "Live Logs" card and an inline script that streams logs over EventSource
 // and polls the chart endpoints every two seconds.
 //
 // The page title is task.Name, falling back to task.ID when the name is blank.
 func renderTaskDetailPage(opts HandlerOptions, task Task) string {
 	title := task.Name
 	if strings.TrimSpace(title) == "" {
 		title = task.ID
 	}
 	var body strings.Builder
 	body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
 	body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
 	if task.Status == TaskRunning || task.Status == TaskPending {
 		// NOTE(review): the ID is HTML-escaped but ends up inside a JavaScript
 		// string literal in an attribute; this is only safe while task IDs
 		// never contain quotes or backslashes — confirm against the ID generator.
 		body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
 	}
 	body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
 	body.WriteString(`</div>`)
 	// Prefer the saved report; it is written into the page verbatim, without
 	// escaping, so it is presumably trusted server-generated HTML.
 	if report := loadTaskReportFragment(task); report != "" {
 		body.WriteString(report)
 	} else {
 		body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
 		body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
 		body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
 		if strings.TrimSpace(task.ErrMsg) != "" {
 			body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
 		}
 		body.WriteString(`</div></div>`)
 	}
 	// The chart container is rendered only for running tasks; for pending
 	// tasks the script below finds no container and returns early.
 	if task.Status == TaskRunning {
 		body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
 		body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
 		body.WriteString(`</div></div>`)
 	}
 	if task.Status == TaskRunning || task.Status == TaskPending {
 		body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
 		body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
 		body.WriteString(`</div></div>`)
 		// Inline client script: cancel action, chart card management, and the
 		// log stream. The task ID is spliced into three JavaScript string
 		// literals with HTML escaping only (see the note on the Cancel button).
 		body.WriteString(`<script>
 function cancelTaskDetail(id) {
  fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
    var term = document.getElementById('task-live-log');
    if (term) {
      term.textContent += '\nCancel requested.\n';
      term.scrollTop = term.scrollHeight;
    }
  });
 }
 function renderTaskLiveCharts(taskId, charts) {
  const host = document.getElementById('task-live-charts');
  if (!host) return;
  if (!Array.isArray(charts) || charts.length === 0) {
    host.innerHTML = 'Waiting for metric samples...';
    return;
  }
  const seen = {};
  charts.forEach(function(chart) {
    seen[chart.file] = true;
    let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
    if (img) {
      const card = img.closest('.card');
      if (card) {
        const title = card.querySelector('.card-head');
        if (title) title.textContent = chart.title;
      }
      return;
    }
    const card = document.createElement('div');
    card.className = 'card';
    card.style.margin = '0';
    card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
    card.querySelector('.card-head').textContent = chart.title;
    const body = card.querySelector('.card-body');
    img = document.createElement('img');
    img.setAttribute('data-task-chart', '1');
    img.setAttribute('data-chart-file', chart.file);
    img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
    img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
    img.style.width = '100%';
    img.style.display = 'block';
    img.style.borderRadius = '6px';
    img.alt = chart.title;
    body.appendChild(img);
    host.appendChild(card);
  });
  Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
    const file = img.getAttribute('data-chart-file') || '';
    if (seen[file]) return;
    const card = img.closest('.card');
    if (card) card.remove();
  });
 }
 function loadTaskLiveCharts(taskId) {
  fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
    renderTaskLiveCharts(taskId, charts);
  }).catch(function(){
    const host = document.getElementById('task-live-charts');
    if (host) host.innerHTML = 'Task charts are unavailable.';
  });
 }
 function refreshTaskLiveCharts() {
  document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
    const base = img.dataset.baseSrc;
    if (!base) return;
    img.src = base + '?t=' + Date.now();
  });
 }
 var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
 var _taskDetailTerm = document.getElementById('task-live-log');
 var _taskChartTimer = null;
 var _taskChartsFrozen = false;
 _taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
 _taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
 _taskDetailES.addEventListener('done', function(e){
  if (_taskChartTimer) clearInterval(_taskChartTimer);
  _taskDetailES.close();
  _taskDetailES = null;
  _taskChartsFrozen = true;
  _taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
  _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
  refreshTaskLiveCharts();
 });
 _taskDetailES.onerror = function(){
  if (_taskChartTimer) clearInterval(_taskChartTimer);
  if (_taskDetailES) {
    _taskDetailES.close();
    _taskDetailES = null;
  }
 };
 loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
 _taskChartTimer = setInterval(function(){
  if (_taskChartsFrozen) return;
  loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
  refreshTaskLiveCharts();
 }, 2000);
 </script>`)
 	}
 	// Assemble the document: shared head and navigation (with "tasks" as the
 	// active section), then the topbar and the body built above.
 	return layoutHead(opts.Title+" — "+title) +
 		layoutNav("tasks", opts.BuildLabel) +
 		`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
 		body.String() +
 		`</div></div></body></html>`
 }
 // loadTaskReportFragment returns the contents of the file at
 // task.ReportHTMLPath, or "" when the path is blank, the file cannot be read,
 // or the file is empty.
 func loadTaskReportFragment(task Task) string {
 	reportPath := task.ReportHTMLPath
 	if strings.TrimSpace(reportPath) == "" {
 		return ""
 	}
 	// Read errors are deliberately folded into "no report".
 	if contents, err := os.ReadFile(reportPath); err == nil && len(contents) > 0 {
 		return string(contents)
 	}
 	return ""
 }
 // taskArtifactDownloadLink returns the /export/file download URL for the
 // artifact at absPath, or "" when absPath is blank.
 //
 // The path is query-escaped so that characters such as spaces, '&', '#', '+',
 // or '%' in a file name cannot truncate or corrupt the "path" parameter. The
 // task parameter is currently unused and kept for interface compatibility.
 func taskArtifactDownloadLink(task Task, absPath string) string {
 	if strings.TrimSpace(absPath) == "" {
 		return ""
 	}
 	return fmt.Sprintf(`/export/file?path=%s`, url.QueryEscape(absPath))
 }
 // taskSamplesForRequest resolves the task named by the request's "id" path
 // value and loads the metric samples inside its time window.
 //
 // It returns a copy of the task, the samples, the window start and end, and
 // whether the task exists. A failure to load samples is not an error for the
 // caller: the task is still reported as found, with nil samples.
 func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
 	taskPtr, found := globalQueue.findByID(r.PathValue("id"))
 	if !found {
 		var none time.Time
 		return Task{}, nil, none, none, false
 	}
 	task := *taskPtr
 	start, end := taskTimeWindow(&task)
 	samples, err := loadTaskMetricSamples(start, end)
 	if err != nil {
 		// Best effort: drop whatever was returned alongside the error.
 		samples = nil
 	}
 	return task, samples, start, end, true
 }
 // taskTimelineForTask returns a one-element timeline marking the task's whole
 // time window, as computed by taskTimeWindow, as active.
 func taskTimelineForTask(task Task) []chartTimelineSegment {
 	var window chartTimelineSegment
 	window.Start, window.End = taskTimeWindow(&task)
 	window.Active = true
 	return []chartTimelineSegment{window}
 }
 // taskChartPathFromFile maps a chart file name to its chart path. Names listed
 // in taskDashboardChartSpecs map to their configured path. Any other name of
 // the form "gpu-<id>-overview.svg" maps to "gpu/<id>-overview". The boolean is
 // false when the name matches neither.
 func taskChartPathFromFile(file string) (string, bool) {
 	name := strings.TrimSpace(file)
 	for i := range taskDashboardChartSpecs {
 		if taskDashboardChartSpecs[i].File == name {
 			return taskDashboardChartSpecs[i].Path, true
 		}
 	}
 	const (
 		gpuPrefix      = "gpu-"
 		overviewSuffix = "-overview.svg"
 	)
 	// Both checks run against the full name before anything is trimmed.
 	if !strings.HasPrefix(name, gpuPrefix) || !strings.HasSuffix(name, overviewSuffix) {
 		return "", false
 	}
 	id := strings.TrimPrefix(name, gpuPrefix)
 	id = strings.TrimSuffix(id, overviewSuffix)
 	return "gpu/" + id + "-overview", true
 }
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -0,0 +1,343 @@
 package webui
 import (
 	"encoding/json"
 	"fmt"
 	"html"
 	"os"
 	"path/filepath"
 	"sort"
 	"strings"
 	"time"
 	"bee/audit/internal/platform"
 )
 // taskReportMetricsDBPath is the metrics database path opened by
 // loadTaskMetricSamples. It is a variable initialised from metricsDBPath
 // (presumably so tests can point report generation at another DB — confirm).
 var taskReportMetricsDBPath = metricsDBPath
 // taskReport is the per-task summary that writeTaskReportArtifacts serialises
 // to the task's report JSON file and renders into the HTML report fragment.
 type taskReport struct {
 	ID          string            `json:"id"`
 	Name        string            `json:"name"`
 	Target      string            `json:"target"`
 	Status      string            `json:"status"`
 	CreatedAt   time.Time         `json:"created_at"`
 	StartedAt   *time.Time        `json:"started_at,omitempty"`
 	DoneAt      *time.Time        `json:"done_at,omitempty"`
 	DurationSec int               `json:"duration_sec,omitempty"`
 	Error       string            `json:"error,omitempty"`
 	// LogFile is the base name of the task log, not its full path.
 	LogFile     string            `json:"log_file,omitempty"`
 	// Charts lists the chart files that were written next to the report.
 	Charts      []taskReportChart `json:"charts,omitempty"`
 	// GeneratedAt is the UTC time at which the report was produced.
 	GeneratedAt time.Time         `json:"generated_at"`
 }
 // taskReportChart describes one chart written for a task report: its display
 // title and the file name of the SVG inside the task artifacts directory.
 type taskReportChart struct {
 	Title string `json:"title"`
 	File  string `json:"file"`
 }
 // taskChartSpec pairs a chart path (the identifier handed to
 // renderTaskChartSVG) with the file name the rendered SVG is saved under.
 type taskChartSpec struct {
 	Path string
 	File string
 }
 // taskDashboardChartSpecs is the fixed set of server-level and all-GPU charts
 // considered for every task report. Per-GPU overview charts are appended
 // separately by taskChartSpecsForSamples.
 var taskDashboardChartSpecs = []taskChartSpec{
 	{Path: "server-load", File: "server-load.svg"},
 	{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
 	{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
 	{Path: "server-power", File: "server-power.svg"},
 	{Path: "server-fans", File: "server-fans.svg"},
 	{Path: "gpu-all-load", File: "gpu-all-load.svg"},
 	{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
 	{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
 	{Path: "gpu-all-power", File: "gpu-all-power.svg"},
 	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
 }
 // taskChartSpecsForSamples returns the chart specs to render for samples: the
 // fixed dashboard charts followed by one "gpu/<idx>-overview" chart for every
 // GPU index present in the samples, in ascending index order.
 func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
 	// Collect the GPU indices once; the scan walks every sample and sorts.
 	gpuIndices := taskGPUIndices(samples)
 	specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(gpuIndices))
 	specs = append(specs, taskDashboardChartSpecs...)
 	for _, idx := range gpuIndices {
 		specs = append(specs, taskChartSpec{
 			Path: fmt.Sprintf("gpu/%d-overview", idx),
 			File: fmt.Sprintf("gpu-%d-overview.svg", idx),
 		})
 	}
 	return specs
 }
 // writeTaskReportArtifacts generates the report artifacts for t inside its
 // artifacts directory: the chart SVGs, the report JSON and the HTML fragment.
 // It is a no-op (nil error) when t is nil or has no artifacts directory.
 // Errors from creating the directory or writing the JSON/HTML are returned.
 func writeTaskReportArtifacts(t *Task) error {
 	if t == nil {
 		return nil
 	}
 	// Fill in the report paths on t before they are used below.
 	ensureTaskReportPaths(t)
 	if strings.TrimSpace(t.ArtifactsDir) == "" {
 		return nil
 	}
 	if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
 		return err
 	}
 	start, end := taskTimeWindow(t)
 	// Best effort: a metrics load failure just yields a report without charts.
 	samples, _ := loadTaskMetricSamples(start, end)
 	charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
 	// Best effort: an unreadable log leaves the log section empty.
 	logText := ""
 	if data, err := os.ReadFile(t.LogPath); err == nil {
 		logText = string(data)
 	}
 	report := taskReport{
 		ID:          t.ID,
 		Name:        t.Name,
 		Target:      t.Target,
 		Status:      t.Status,
 		CreatedAt:   t.CreatedAt,
 		StartedAt:   t.StartedAt,
 		DoneAt:      t.DoneAt,
 		DurationSec: taskElapsedSec(t, reportDoneTime(t)),
 		Error:       t.ErrMsg,
 		LogFile:     filepath.Base(t.LogPath),
 		Charts:      charts,
 		GeneratedAt: time.Now().UTC(),
 	}
 	if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
 		return err
 	}
 	return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
 }
 // reportDoneTime returns the task's completion time, or the current time when
 // t is nil or has no non-zero DoneAt.
 func reportDoneTime(t *Task) time.Time {
 	if t == nil || t.DoneAt == nil || t.DoneAt.IsZero() {
 		return time.Now()
 	}
 	return *t.DoneAt
 }
 // taskTimeWindow returns the UTC [start, end] interval covered by t.
 // start is StartedAt when set, otherwise CreatedAt; end is DoneAt when set,
 // otherwise the current time. end is never earlier than start. A nil task
 // yields the current instant for both values.
 func taskTimeWindow(t *Task) (time.Time, time.Time) {
 	if t == nil {
 		instant := time.Now().UTC()
 		return instant, instant
 	}
 	from := t.CreatedAt.UTC()
 	if started := t.StartedAt; started != nil && !started.IsZero() {
 		from = started.UTC()
 	}
 	var to time.Time
 	if done := t.DoneAt; done != nil && !done.IsZero() {
 		to = done.UTC()
 	} else {
 		to = time.Now().UTC()
 	}
 	// Clamp an inverted window to a zero-length one at start.
 	if to.Before(from) {
 		return from, from
 	}
 	return from, to
 }
 // loadTaskMetricSamples opens the metrics database at taskReportMetricsDBPath
 // and returns the samples recorded between start and end. The database is
 // closed before returning.
 func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
 	store, openErr := openMetricsDB(taskReportMetricsDBPath)
 	if openErr != nil {
 		return nil, openErr
 	}
 	defer store.Close()
 	return store.LoadBetween(start, end)
 }
 // writeTaskCharts renders every applicable chart for samples, saves each SVG
 // into dir, and returns the list of charts written together with a map from
 // file name to SVG markup for inlining. Charts that do not render, render
 // empty, or fail to write are skipped. With no samples both results are nil.
 func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
 	if len(samples) == 0 {
 		return nil, nil
 	}
 	window := []chartTimelineSegment{{Start: start, End: end, Active: true}}
 	var written []taskReportChart
 	svgByFile := map[string]string{}
 	for _, spec := range taskChartSpecsForSamples(samples) {
 		title, svg, rendered := renderTaskChartSVG(spec.Path, samples, window)
 		if !rendered || len(svg) == 0 {
 			continue
 		}
 		if writeErr := os.WriteFile(filepath.Join(dir, spec.File), svg, 0644); writeErr != nil {
 			continue
 		}
 		written = append(written, taskReportChart{Title: title, File: spec.File})
 		svgByFile[spec.File] = string(svg)
 	}
 	return written, svgByFile
 }
 // renderTaskChartSVG renders the chart identified by path from samples and
 // returns its title, the SVG bytes, and whether rendering succeeded.
 // GPU "overview" paths go through renderGPUOverviewChartSVG; every other path
 // is rendered as a metric chart from chartDataFromSamples.
 func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
 	gpuIdx, kind, isGPU := parseGPUChartPath(path)
 	if isGPU && kind == "overview" {
 		svg, hasData, err := renderGPUOverviewChartSVG(gpuIdx, samples, timeline)
 		if err != nil || !hasData {
 			return "", nil, false
 		}
 		return gpuDisplayLabel(gpuIdx) + " Overview", svg, true
 	}
 	datasets, names, labels, title, yMin, yMax, found := chartDataFromSamples(path, samples)
 	if !found {
 		return "", nil, false
 	}
 	times := sampleTimes(samples)
 	height := chartCanvasHeightForPath(path, len(names))
 	svg, err := renderMetricChartSVG(title, labels, times, datasets, names, yMin, yMax, height, timeline)
 	if err != nil {
 		return "", nil, false
 	}
 	return title, svg, true
 }
 // taskGPUIndices returns the distinct GPU indices that appear anywhere in
 // samples, sorted in ascending order.
 func taskGPUIndices(samples []platform.LiveMetricSample) []int {
 	known := make(map[int]struct{})
 	var indices []int
 	for i := range samples {
 		for _, gpu := range samples[i].GPUs {
 			if _, dup := known[gpu.GPUIndex]; dup {
 				continue
 			}
 			known[gpu.GPUIndex] = struct{}{}
 			indices = append(indices, gpu.GPUIndex)
 		}
 	}
 	sort.Ints(indices)
 	return indices
 }
 // writeJSONFile encodes v as two-space-indented JSON and writes it to path
 // with mode 0644. Nothing is written when encoding fails.
 func writeJSONFile(path string, v any) error {
 	encoded, marshalErr := json.MarshalIndent(v, "", "  ")
 	if marshalErr != nil {
 		return marshalErr
 	}
 	return os.WriteFile(path, encoded, 0644)
 }
 // renderTaskReportFragment builds the HTML fragment for a task report: a
 // summary card (name, target, status, error, timings), an optional benchmark
 // results card, one card per chart (or an info alert when there are none),
 // and the task log. charts maps a chart file name to its inline SVG markup.
 func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
 	var out strings.Builder
 	emit := func(parts ...string) {
 		for _, part := range parts {
 			out.WriteString(part)
 		}
 	}
 	// Summary card.
 	emit(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
 	emit(`<div class="grid2">`)
 	emit(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">`, html.EscapeString(report.Name), `</div>`)
 	emit(`<div style="font-size:13px;color:var(--muted)">`, html.EscapeString(report.Target), `</div></div>`)
 	emit(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>`, renderTaskStatusBadge(report.Status), `</div>`)
 	if strings.TrimSpace(report.Error) != "" {
 		emit(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">`, html.EscapeString(report.Error), `</div>`)
 	}
 	emit(`</div></div>`)
 	emit(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
 	emit(`Started: `, formatTaskTime(report.StartedAt, report.CreatedAt), ` | Finished: `, formatTaskTime(report.DoneAt, time.Time{}), ` | Duration: `, formatTaskDuration(report.DurationSec))
 	emit(`</div></div></div>`)
 	// Benchmark card; an empty result contributes nothing.
 	emit(renderTaskBenchmarkResultsCard(report.Target, logText))
 	// Chart cards, with the SVG markup inlined as-is.
 	if len(report.Charts) == 0 {
 		emit(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
 	}
 	for _, chart := range report.Charts {
 		emit(`<div class="card"><div class="card-head">`, html.EscapeString(chart.Title), `</div><div class="card-body" style="padding:12px">`)
 		emit(charts[chart.File])
 		emit(`</div></div>`)
 	}
 	// Log card.
 	emit(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
 	emit(`<div class="terminal" style="max-height:none;white-space:pre-wrap">`, html.EscapeString(strings.TrimSpace(logText)), `</div>`)
 	emit(`</div></div>`)
 	return out.String()
 }
 // renderTaskBenchmarkResultsCard returns the benchmark results card for an
 // "nvidia-benchmark" task, built from the result file located via the task
 // log. It returns "" for other targets, when no result path can be derived
 // from the log, or when no runs load from that path.
 func renderTaskBenchmarkResultsCard(target, logText string) string {
 	const benchmarkTarget = "nvidia-benchmark"
 	if strings.TrimSpace(target) != benchmarkTarget {
 		return ""
 	}
 	resultFile := taskBenchmarkResultPath(logText)
 	if strings.TrimSpace(resultFile) == "" {
 		return ""
 	}
 	columns, runs := loadBenchmarkHistoryFromPaths([]string{resultFile})
 	if len(runs) == 0 {
 		return ""
 	}
 	const (
 		heading     = "Benchmark Results"
 		description = "Composite score for this benchmark task."
 		emptyNote   = "No benchmark results were saved for this task."
 	)
 	return renderBenchmarkResultsCardFromRuns(heading, description, emptyNote, columns, runs)
 }
 // taskBenchmarkResultPath derives the result.json path of a benchmark run
 // from the archive path recorded in logText: the ".tar.gz" suffix is dropped
 // and "result.json" is joined onto the remaining run directory. It returns ""
 // when the log names no archive.
 func taskBenchmarkResultPath(logText string) string {
 	archive := taskArchivePathFromLog(logText)
 	runDir, isTarball := strings.CutSuffix(archive, ".tar.gz")
 	if archive == "" || !isTarball {
 		return ""
 	}
 	return filepath.Join(runDir, "result.json")
 }
 // taskArchivePathFromLog scans logText from the last line backwards for a
 // line starting with "Archive:" and returns the ".tar.gz" path it names.
 // An optional "Archive written to " lead-in after the marker is stripped.
 // Marker lines whose value is not a ".tar.gz" path are skipped. It returns ""
 // when no line qualifies.
 func taskArchivePathFromLog(logText string) string {
 	const (
 		marker = "Archive:"
 		leadIn = "Archive written to "
 	)
 	lines := strings.Split(logText, "\n")
 	for i := len(lines) - 1; i >= 0; i-- {
 		rest, tagged := strings.CutPrefix(strings.TrimSpace(lines[i]), marker)
 		if !tagged {
 			continue
 		}
 		candidate := strings.TrimSpace(rest)
 		candidate = strings.TrimSpace(strings.TrimPrefix(candidate, leadIn))
 		if strings.HasSuffix(candidate, ".tar.gz") {
 			return candidate
 		}
 	}
 	return ""
 }
 // renderTaskStatusBadge returns a <span> badge for status. Running and done
 // tasks use "badge-ok", failed tasks "badge-err", and everything else
 // "badge-unknown". A blank status is labelled "unknown".
 func renderTaskStatusBadge(status string) string {
 	var class string
 	switch status {
 	case TaskRunning, TaskDone:
 		class = "badge-ok"
 	case TaskFailed:
 		class = "badge-err"
 	default:
 		// Covers pending, cancelled and any unrecognised status.
 		class = "badge-unknown"
 	}
 	text := strings.TrimSpace(status)
 	if text == "" {
 		text = "unknown"
 	}
 	return `<span class="badge ` + class + `">` + html.EscapeString(text) + `</span>`
 }
 // formatTaskTime formats ts in local time as "2006-01-02 15:04:05". When ts is
 // nil or zero it formats fallback instead, and when fallback is zero as well
 // it returns "n/a".
 func formatTaskTime(ts *time.Time, fallback time.Time) string {
 	const layout = "2006-01-02 15:04:05"
 	switch {
 	case ts != nil && !ts.IsZero():
 		return ts.Local().Format(layout)
 	case !fallback.IsZero():
 		return fallback.Local().Format(layout)
 	default:
 		return "n/a"
 	}
 }
 // formatTaskDuration renders a duration given in whole seconds as "Ns",
 // "Mm SSs" or "Hh MMm SSs" depending on its size. Zero and negative values
 // yield "n/a".
 func formatTaskDuration(sec int) string {
 	hours, minutes, seconds := sec/3600, (sec%3600)/60, sec%60
 	switch {
 	case sec <= 0:
 		return "n/a"
 	case sec < 60:
 		return fmt.Sprintf("%ds", sec)
 	case sec < 3600:
 		return fmt.Sprintf("%dm %02ds", minutes, seconds)
 	default:
 		return fmt.Sprintf("%dh %02dm %02ds", hours, minutes, seconds)
 	}
 }
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -4,10 +4,12 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"log/slog"
 	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"runtime/debug"
 	"sort"
 	"strings"
 	"sync"
@@ -28,22 +30,29 @@ const (
 // taskNames maps target → human-readable name for validate (SAT) runs.
 var taskNames = map[string]string{
-	"nvidia":          "NVIDIA SAT",
+	"nvidia":                 "NVIDIA SAT",
-	"nvidia-stress":   "NVIDIA GPU Stress",
+	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
-	"memory":          "Memory SAT",
+	"nvidia-benchmark":       "NVIDIA Benchmark",
-	"storage":         "Storage SAT",
+	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
-	"cpu":             "CPU SAT",
+	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
-	"amd":             "AMD GPU SAT",
+	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
-	"amd-mem":         "AMD GPU MEM Integrity",
+	"nvidia-interconnect":    "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
-	"amd-bandwidth":   "AMD GPU MEM Bandwidth",
+	"nvidia-bandwidth":       "NVIDIA Bandwidth Test (NVBandwidth)",
-	"amd-stress":      "AMD GPU Burn-in",
+	"nvidia-stress":          "NVIDIA GPU Stress",
-	"memory-stress":   "Memory Burn-in",
+	"memory":                 "Memory SAT",
-	"sat-stress":      "SAT Stress (stressapptest)",
+	"storage":                "Storage SAT",
-	"platform-stress": "Platform Thermal Cycling",
+	"cpu":                    "CPU SAT",
-	"audit":           "Audit",
+	"amd":                    "AMD GPU SAT",
-	"support-bundle":  "Support Bundle",
+	"amd-mem":                "AMD GPU MEM Integrity",
-	"install":         "Install to Disk",
+	"amd-bandwidth":          "AMD GPU MEM Bandwidth",
-	"install-to-ram":  "Install to RAM",
+	"amd-stress":             "AMD GPU Burn-in",
 	"memory-stress":          "Memory Burn-in",
 	"sat-stress":             "SAT Stress (stressapptest)",
 	"platform-stress":        "Platform Thermal Cycling",
 	"audit":                  "Audit",
 	"support-bundle":         "Support Bundle",
 	"install":                "Install to Disk",
 	"install-to-ram":         "Install to RAM",
 }
 // burnNames maps target → human-readable name when a burn profile is set.
@@ -83,17 +92,20 @@ func taskDisplayName(target, profile, loader string) string {
 // Task represents one unit of work in the queue.
 type Task struct {
-	ID         string     `json:"id"`
+	ID             string     `json:"id"`
-	Name       string     `json:"name"`
+	Name           string     `json:"name"`
-	Target     string     `json:"target"`
+	Target         string     `json:"target"`
-	Priority   int        `json:"priority"`
+	Priority       int        `json:"priority"`
-	Status     string     `json:"status"`
+	Status         string     `json:"status"`
-	CreatedAt  time.Time  `json:"created_at"`
+	CreatedAt      time.Time  `json:"created_at"`
-	StartedAt  *time.Time `json:"started_at,omitempty"`
+	StartedAt      *time.Time `json:"started_at,omitempty"`
-	DoneAt     *time.Time `json:"done_at,omitempty"`
+	DoneAt         *time.Time `json:"done_at,omitempty"`
-	ElapsedSec int        `json:"elapsed_sec,omitempty"`
+	ElapsedSec     int        `json:"elapsed_sec,omitempty"`
-	ErrMsg     string     `json:"error,omitempty"`
+	ErrMsg         string     `json:"error,omitempty"`
-	LogPath    string     `json:"log_path,omitempty"`
+	LogPath        string     `json:"log_path,omitempty"`
 	ArtifactsDir   string     `json:"artifacts_dir,omitempty"`
 	ReportJSONPath string     `json:"report_json_path,omitempty"`
 	ReportHTMLPath string     `json:"report_html_path,omitempty"`
 	// runtime fields (not serialised)
 	job    *jobState
@@ -106,80 +118,96 @@ type taskParams struct {
 	DiagLevel          int      `json:"diag_level,omitempty"`
 	GPUIndices         []int    `json:"gpu_indices,omitempty"`
 	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
 	SizeMB             int      `json:"size_mb,omitempty"`
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
 }
 type persistedTask struct {
-	ID        string     `json:"id"`
+	ID             string     `json:"id"`
-	Name      string     `json:"name"`
+	Name           string     `json:"name"`
-	Target    string     `json:"target"`
+	Target         string     `json:"target"`
-	Priority  int        `json:"priority"`
+	Priority       int        `json:"priority"`
-	Status    string     `json:"status"`
+	Status         string     `json:"status"`
-	CreatedAt time.Time  `json:"created_at"`
+	CreatedAt      time.Time  `json:"created_at"`
-	StartedAt *time.Time `json:"started_at,omitempty"`
+	StartedAt      *time.Time `json:"started_at,omitempty"`
-	DoneAt    *time.Time `json:"done_at,omitempty"`
+	DoneAt         *time.Time `json:"done_at,omitempty"`
-	ErrMsg    string     `json:"error,omitempty"`
+	ErrMsg         string     `json:"error,omitempty"`
-	LogPath   string     `json:"log_path,omitempty"`
+	LogPath        string     `json:"log_path,omitempty"`
-	Params    taskParams `json:"params,omitempty"`
+	ArtifactsDir   string     `json:"artifacts_dir,omitempty"`
 	ReportJSONPath string     `json:"report_json_path,omitempty"`
 	ReportHTMLPath string     `json:"report_html_path,omitempty"`
 	Params         taskParams `json:"params,omitempty"`
 }
 type burnPreset struct {
 	NvidiaDiag  int
 	DurationSec int
 }
 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
-		return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
+		return burnPreset{DurationSec: 8 * 60 * 60}
 	case "acceptance":
-		return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
+		return burnPreset{DurationSec: 60 * 60}
 	default:
-		return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
+		return burnPreset{DurationSec: 5 * 60}
 	}
 }
 func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
 	acceptanceCycles := []platform.PlatformStressCycle{
 		{LoadSec: 85, IdleSec: 5},
 		{LoadSec: 80, IdleSec: 10},
 		{LoadSec: 55, IdleSec: 5},
 		{LoadSec: 60, IdleSec: 0},
 		{LoadSec: 100, IdleSec: 10},
 		{LoadSec: 145, IdleSec: 15},
 		{LoadSec: 190, IdleSec: 20},
 		{LoadSec: 235, IdleSec: 25},
 		{LoadSec: 280, IdleSec: 30},
 		{LoadSec: 325, IdleSec: 35},
 		{LoadSec: 370, IdleSec: 40},
 		{LoadSec: 415, IdleSec: 45},
 		{LoadSec: 460, IdleSec: 50},
 		{LoadSec: 510, IdleSec: 0},
 	}
 	switch profile {
 	case "overnight":
-		return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
+		cycles := make([]platform.PlatformStressCycle, 0, len(acceptanceCycles)*8)
-			{LoadSec: 600, IdleSec: 120},
+		for range 8 {
-			{LoadSec: 600, IdleSec: 60},
+			cycles = append(cycles, acceptanceCycles...)
-			{LoadSec: 600, IdleSec: 30},
+		}
-			{LoadSec: 600, IdleSec: 120},
+		return platform.PlatformStressOptions{Cycles: cycles}
 			{LoadSec: 600, IdleSec: 60},
 			{LoadSec: 600, IdleSec: 30},
 			{LoadSec: 600, IdleSec: 120},
 			{LoadSec: 600, IdleSec: 60},
 		}}
 	case "acceptance":
-		return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
+		return platform.PlatformStressOptions{Cycles: acceptanceCycles}
 			{LoadSec: 300, IdleSec: 60},
 			{LoadSec: 300, IdleSec: 30},
 			{LoadSec: 300, IdleSec: 60},
 			{LoadSec: 300, IdleSec: 30},
 		}}
 	default: // smoke
 		return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
-			{LoadSec: 90, IdleSec: 60},
+			{LoadSec: 85, IdleSec: 5},
-			{LoadSec: 90, IdleSec: 30},
+			{LoadSec: 80, IdleSec: 10},
 			{LoadSec: 55, IdleSec: 5},
 			{LoadSec: 60, IdleSec: 0},
 		}}
 	}
 }
 // taskQueue manages a priority-ordered list of tasks and runs them one at a time.
 type taskQueue struct {
-	mu        sync.Mutex
+	mu          sync.Mutex
-	tasks     []*Task
+	tasks       []*Task
-	trigger   chan struct{}
+	trigger     chan struct{}
-	opts      *HandlerOptions // set by startWorker
+	opts        *HandlerOptions // set by startWorker
-	statePath string
+	statePath   string
-	logsDir   string
+	logsDir     string
-	started   bool
+	started     bool
 	kmsgWatcher *kmsgWatcher
 }
 var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
@@ -231,6 +259,7 @@ func (q *taskQueue) enqueue(t *Task) {
 	q.prune()
 	q.persistLocked()
 	q.mu.Unlock()
 	taskSerialEvent(t, "queued")
 	select {
 	case q.trigger <- struct{}{}:
 	default:
@@ -376,7 +405,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 	if !q.started {
 		q.loadLocked()
 		q.started = true
-		go q.worker()
+		goRecoverLoop("task worker", 2*time.Second, q.worker)
 	}
 	hasPending := q.nextPending() != nil
 	q.mu.Unlock()
@@ -391,47 +420,101 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 func (q *taskQueue) worker() {
 	for {
 		<-q.trigger
-		setCPUGovernor("performance")
+		func() {
-		for {
+			setCPUGovernor("performance")
-			q.mu.Lock()
+			defer setCPUGovernor("powersave")
 			t := q.nextPending()
 			if t == nil {
 				q.mu.Unlock()
 				break
 			}
 			now := time.Now()
 			t.Status = TaskRunning
 			t.StartedAt = &now
 			t.DoneAt = nil
 			t.ErrMsg = ""
 			j := newTaskJobState(t.LogPath)
 			ctx, cancel := context.WithCancel(context.Background())
 			j.cancel = cancel
 			t.job = j
 			q.persistLocked()
 			q.mu.Unlock()
-			q.runTask(t, j, ctx)
+			for {
-
+				q.mu.Lock()
-			q.mu.Lock()
+				t := q.nextPending()
-			now2 := time.Now()
+				if t == nil {
-			t.DoneAt = &now2
+					q.prune()
-			if t.Status == TaskRunning { // not cancelled externally
+					q.persistLocked()
-				if j.err != "" {
+					q.mu.Unlock()
-					t.Status = TaskFailed
+					return
 					t.ErrMsg = j.err
 				} else {
 					t.Status = TaskDone
 				}
 				now := time.Now()
 				t.Status = TaskRunning
 				t.StartedAt = &now
 				t.DoneAt = nil
 				t.ErrMsg = ""
 				j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
 				t.job = j
 				q.persistLocked()
 				q.mu.Unlock()
 				taskCtx, taskCancel := context.WithCancel(context.Background())
 				j.cancel = taskCancel
 				q.executeTask(t, j, taskCtx)
 				taskCancel()
 				q.mu.Lock()
 				q.prune()
 				q.persistLocked()
 				q.mu.Unlock()
 			}
-			q.prune()
+		}()
-			q.persistLocked()
+
 			q.mu.Unlock()
 		}
 		setCPUGovernor("powersave")
 	}
 }
 // executeTask runs one task via runTask with panic protection and guarantees
 // that finalisation happens afterwards. The deferred calls run in reverse
 // order of registration: the panic recovery first, then the kmsg watcher
 // notification, and finally finalizeTaskRun.
 func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
 	startedKmsgWatch := false
 	// Registered first so it runs last, after any panic has been turned into
 	// a job error below.
 	defer q.finalizeTaskRun(t, j)
 	defer func() {
 		// Only signal "finished" if "started" was signalled for this task.
 		if startedKmsgWatch && q.kmsgWatcher != nil {
 			q.kmsgWatcher.NotifyTaskFinished(t.ID)
 		}
 	}()
 	defer func() {
 		// Convert a panic in the task into a logged error and a job failure
 		// message instead of letting it propagate out of executeTask.
 		if rec := recover(); rec != nil {
 			msg := fmt.Sprintf("task panic: %v", rec)
 			slog.Error("task panic",
 				"task_id", t.ID,
 				"target", t.Target,
 				"panic", fmt.Sprint(rec),
 				"stack", string(debug.Stack()),
 			)
 			j.append("ERROR: " + msg)
 			j.finish(msg)
 		}
 	}()
 	// The kmsg watcher is only told about tasks whose target is a SAT target.
 	if q.kmsgWatcher != nil && isSATTarget(t.Target) {
 		q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
 		startedKmsgWatch = true
 	}
 	q.runTask(t, j, ctx)
 }
 // finalizeTaskRun records the outcome of a finished task run: it stamps
 // DoneAt, resolves the final status from the job error, finalises the
 // artifact paths, persists the queue, writes the report artifacts and emits
 // the "finished" serial event.
 func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
 	q.mu.Lock()
 	now := time.Now()
 	t.DoneAt = &now
 	// A status other than running was set elsewhere while the task ran
 	// (presumably by a cancel handler — confirm) and is left untouched.
 	if t.Status == TaskRunning {
 		if j.err != "" {
 			t.Status = TaskFailed
 			t.ErrMsg = j.err
 		} else {
 			t.Status = TaskDone
 			t.ErrMsg = ""
 		}
 	}
 	q.finalizeTaskArtifactPathsLocked(t)
 	q.persistLocked()
 	q.mu.Unlock()
 	// Report generation runs after the queue lock is released. A failure is
 	// only logged to the task log; it does not change the task status.
 	// NOTE(review): t is read below without holding q.mu — confirm nothing
 	// else mutates it concurrently at this point.
 	if err := writeTaskReportArtifacts(t); err != nil {
 		appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
 	}
 	if t.ErrMsg != "" {
 		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
 		return
 	}
 	taskSerialEvent(t, "finished with status="+t.Status)
 }
 // setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
 // Silently ignores errors (e.g. when cpufreq is not available).
 func setCPUGovernor(governor string) {
@@ -470,9 +553,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			break
 		}
 		diagLevel := t.params.DiagLevel
 		if t.params.BurnProfile != "" && diagLevel <= 0 {
 			diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
 		}
 		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
 			result, e := a.RunNvidiaAcceptancePackWithOptions(
 				ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -485,6 +565,79 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		} else {
 			archive, err = a.RunNvidiaAcceptancePack("", j.append)
 		}
 	case "nvidia-targeted-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
 		dur := t.params.Duration
 		if dur <= 0 {
 			dur = 300
 		}
 		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
 	case "nvidia-benchmark":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
 		archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
 			Profile:           t.params.BenchmarkProfile,
 			SizeMB:            t.params.SizeMB,
 			GPUIndices:        t.params.GPUIndices,
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 			RunNCCL:           t.params.RunNCCL,
 			ParallelGPUs:      t.params.ParallelGPUs,
 		}, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
 		dur := t.params.Duration
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
 	case "nvidia-targeted-power":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
 		dur := t.params.Duration
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
 	case "nvidia-pulse":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
 		dur := t.params.Duration
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
 	case "nvidia-bandwidth":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
 		archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
 	case "nvidia-interconnect":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
 		}
 		dur := t.params.Duration
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
 			DurationSec: dur,
 			Loader:      platform.NvidiaStressLoaderNCCL,
 			GPUIndices:  t.params.GPUIndices,
 		}, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -618,6 +771,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		return
 	}
 	// If the SAT archive was produced, check overall_status and write to component DB.
 	if archive != "" {
 		archivePath := app.ExtractArchivePath(archive)
 		if err == nil {
 			if app.ReadSATOverallStatus(archivePath) == "FAILED" {
 				err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
 			}
 		}
 		if db := q.statusDB(); db != nil {
 			app.ApplySATResultToDB(db, t.Target, archivePath)
 		}
 	}
 	if err != nil {
 		if ctx.Err() != nil {
 			j.append("Aborted.")
@@ -634,6 +800,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	j.finish("")
 }
 // statusDB returns the component status database of the configured app, or
 // nil when the queue has no options or no app.
 func (q *taskQueue) statusDB() *app.ComponentStatusDB {
 	if opts := q.opts; opts != nil && opts.App != nil {
 		return opts.App.StatusDB
 	}
 	return nil
 }
 func splitLines(s string) []string {
 	var out []string
 	for _, l := range splitNL(s) {
@@ -679,6 +852,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
 		now := time.Now()
 		t.DoneAt = &now
 		globalQueue.persistLocked()
 		taskSerialEvent(t, "finished with status="+t.Status)
 		writeJSON(w, map[string]string{"status": "cancelled"})
 	case TaskRunning:
 		if t.job != nil {
@@ -688,6 +862,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
 		now := time.Now()
 		t.DoneAt = &now
 		globalQueue.persistLocked()
 		taskSerialEvent(t, "finished with status="+t.Status)
 		writeJSON(w, map[string]string{"status": "cancelled"})
 	default:
 		writeError(w, http.StatusConflict, "task is not running or pending")
@@ -728,6 +903,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 		case TaskPending:
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
 			n++
 		case TaskRunning:
 			if t.job != nil {
@@ -735,6 +911,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
 			n++
 		}
 	}
@@ -753,6 +930,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 		case TaskPending:
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
 			cancelled++
 		case TaskRunning:
 			if t.job != nil {
@@ -760,6 +938,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
 			cancelled++
 		}
 	}
@@ -823,10 +1002,10 @@ func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
 }
 func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
-	if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
+	if q.logsDir == "" || t.ID == "" {
 		return
 	}
-	t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
+	q.ensureTaskArtifactPathsLocked(t)
 }
 func (q *taskQueue) loadLocked() {
@@ -843,17 +1022,20 @@ func (q *taskQueue) loadLocked() {
 	}
 	for _, pt := range persisted {
 		t := &Task{
-			ID:        pt.ID,
+			ID:             pt.ID,
-			Name:      pt.Name,
+			Name:           pt.Name,
-			Target:    pt.Target,
+			Target:         pt.Target,
-			Priority:  pt.Priority,
+			Priority:       pt.Priority,
-			Status:    pt.Status,
+			Status:         pt.Status,
-			CreatedAt: pt.CreatedAt,
+			CreatedAt:      pt.CreatedAt,
-			StartedAt: pt.StartedAt,
+			StartedAt:      pt.StartedAt,
-			DoneAt:    pt.DoneAt,
+			DoneAt:         pt.DoneAt,
-			ErrMsg:    pt.ErrMsg,
+			ErrMsg:         pt.ErrMsg,
-			LogPath:   pt.LogPath,
+			LogPath:        pt.LogPath,
-			params:    pt.Params,
+			ArtifactsDir:   pt.ArtifactsDir,
 			ReportJSONPath: pt.ReportJSONPath,
 			ReportHTMLPath: pt.ReportHTMLPath,
 			params:         pt.Params,
 		}
 		q.assignTaskLogPathLocked(t)
 		if t.Status == TaskRunning {
@@ -884,17 +1066,20 @@ func (q *taskQueue) persistLocked() {
 	state := make([]persistedTask, 0, len(q.tasks))
 	for _, t := range q.tasks {
 		state = append(state, persistedTask{
-			ID:        t.ID,
+			ID:             t.ID,
-			Name:      t.Name,
+			Name:           t.Name,
-			Target:    t.Target,
+			Target:         t.Target,
-			Priority:  t.Priority,
+			Priority:       t.Priority,
-			Status:    t.Status,
+			Status:         t.Status,
-			CreatedAt: t.CreatedAt,
+			CreatedAt:      t.CreatedAt,
-			StartedAt: t.StartedAt,
+			StartedAt:      t.StartedAt,
-			DoneAt:    t.DoneAt,
+			DoneAt:         t.DoneAt,
-			ErrMsg:    t.ErrMsg,
+			ErrMsg:         t.ErrMsg,
-			LogPath:   t.LogPath,
+			LogPath:        t.LogPath,
-			Params:    t.params,
+			ArtifactsDir:   t.ArtifactsDir,
 			ReportJSONPath: t.ReportJSONPath,
 			ReportHTMLPath: t.ReportHTMLPath,
 			Params:         t.params,
 		})
 	}
 	data, err := json.MarshalIndent(state, "", "  ")
@@ -925,3 +1110,113 @@ func taskElapsedSec(t *Task, now time.Time) int {
 	}
 	return int(end.Sub(start).Round(time.Second) / time.Second)
 }
 // taskFolderStatus normalizes a task status for use in an artifacts folder
 // name. The input is trimmed and lowercased; it is returned as-is when it
 // equals TaskRunning, TaskDone, TaskFailed or TaskCancelled, and every other
 // value (including the empty string) maps to TaskPending.
 func taskFolderStatus(status string) string {
 	normalized := strings.ToLower(strings.TrimSpace(status))
 	for _, known := range []string{TaskRunning, TaskDone, TaskFailed, TaskCancelled} {
 		if normalized == known {
 			return normalized
 		}
 	}
 	return TaskPending
 }
 // sanitizeTaskFolderPart turns an arbitrary label into a folder-name-safe slug.
 //
 // The input is trimmed and lowercased, every run of characters outside
 // [a-z0-9] collapses into a single '-', and leading/trailing dashes are
 // dropped. When nothing usable remains the literal "task" is returned, so the
 // result is never empty.
 func sanitizeTaskFolderPart(s string) string {
 	lowered := strings.ToLower(strings.TrimSpace(s))
 	// Splitting on every non-alphanumeric rune yields only the non-empty
 	// alphanumeric runs; joining them with '-' both collapses separator runs
 	// and removes separators at either end.
 	words := strings.FieldsFunc(lowered, func(r rune) bool {
 		isLetter := r >= 'a' && r <= 'z'
 		isDigit := r >= '0' && r <= '9'
 		return !isLetter && !isDigit
 	})
 	if len(words) == 0 {
 		return "task"
 	}
 	return strings.Join(words, "-")
 }
 // taskArtifactsDir returns the artifacts folder path for t under root. The
 // folder is named "<number prefix>_<sanitized task name>_<folder status>",
 // where the status component is derived from the status argument rather than
 // from t.Status. It returns "" when root is blank or t is nil.
 func taskArtifactsDir(root string, t *Task, status string) string {
 	if t == nil || strings.TrimSpace(root) == "" {
 		return ""
 	}
 	number := taskFolderNumberPrefix(t.ID)
 	label := sanitizeTaskFolderPart(t.Name)
 	suffix := taskFolderStatus(status)
 	return filepath.Join(root, fmt.Sprintf("%s_%s_%s", number, label, suffix))
 }
 // taskFolderNumberPrefix returns the leading component of a task's artifacts
 // folder name.
 //
 // IDs of the form "TASK-<digits>" with at least three ASCII digits yield the
 // digits themselves ("TASK-007" -> "007", "TASK-1000" -> "1000"), so folders
 // keep a numeric prefix once the zero-padded counter grows past three digits.
 // Any other ID falls back to its sanitized form.
 func taskFolderNumberPrefix(taskID string) string {
 	taskID = strings.TrimSpace(taskID)
 	if strings.HasPrefix(taskID, "TASK-") {
 		num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-"))
 		// IndexFunc reports -1 when no rune falls outside '0'..'9'.
 		nonDigit := strings.IndexFunc(num, func(r rune) bool { return r < '0' || r > '9' })
 		if len(num) >= 3 && nonDigit < 0 {
 			return num
 		}
 	}
 	fallback := sanitizeTaskFolderPart(taskID)
 	if fallback == "" {
 		// Safety net only: sanitizeTaskFolderPart substitutes "task" for an
 		// empty result, so this branch is not expected to be taken.
 		return "000"
 	}
 	return fallback
 }
 // ensureTaskReportPaths points the task's log and report files at its
 // artifacts folder. It does nothing when t is nil or ArtifactsDir is blank.
 //
 // LogPath is only rewritten when it is unset or already named "task.log";
 // a log path with any other file name is left alone. The JSON and HTML report
 // paths are always reset to report.json / report.html inside ArtifactsDir.
 func ensureTaskReportPaths(t *Task) {
 	if t == nil {
 		return
 	}
 	dir := t.ArtifactsDir
 	if strings.TrimSpace(dir) == "" {
 		return
 	}
 	logUnset := t.LogPath == ""
 	if logUnset || filepath.Base(t.LogPath) == "task.log" {
 		t.LogPath = filepath.Join(dir, "task.log")
 	}
 	t.ReportJSONPath, t.ReportHTMLPath = filepath.Join(dir, "report.json"), filepath.Join(dir, "report.html")
 }
 // ensureTaskArtifactPathsLocked makes sure t has an artifacts folder and that
 // its log/report paths point into it. It is a no-op when t is nil, the queue
 // has no logs directory, or the task has no ID.
 //
 // A blank ArtifactsDir is derived from the task's current status; an existing
 // one is kept. Folder creation is best-effort: a MkdirAll failure is ignored.
 // NOTE(review): the "Locked" suffix suggests the caller holds the queue mutex
 // — confirm against the call sites.
 func (q *taskQueue) ensureTaskArtifactPathsLocked(t *Task) {
 	if t == nil {
 		return
 	}
 	if strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
 		return
 	}
 	if strings.TrimSpace(t.ArtifactsDir) == "" {
 		t.ArtifactsDir = taskArtifactsDir(q.logsDir, t, t.Status)
 	}
 	if dir := t.ArtifactsDir; dir != "" {
 		_ = os.MkdirAll(dir, 0755)
 	}
 	ensureTaskReportPaths(t)
 }
 // finalizeTaskArtifactPathsLocked renames a task's artifacts folder so that
 // its name reflects the task's current status, then refreshes the log and
 // report paths. It is a no-op when t is nil, the queue has no logs directory,
 // or the task has no ID.
 //
 // If the destination folder already exists it is adopted without moving
 // anything. Otherwise the current folder is renamed; when that rename fails
 // the task keeps its existing ArtifactsDir, so LogPath and the report paths
 // never point into a folder that was not created.
 // NOTE(review): the "Locked" suffix suggests the caller holds the queue mutex
 // — confirm against the call sites.
 func (q *taskQueue) finalizeTaskArtifactPathsLocked(t *Task) {
 	if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
 		return
 	}
 	q.ensureTaskArtifactPathsLocked(t)
 	dstDir := taskArtifactsDir(q.logsDir, t, t.Status)
 	if dstDir == "" {
 		return
 	}
 	if t.ArtifactsDir != "" && t.ArtifactsDir != dstDir {
 		retarget := true
 		if _, err := os.Stat(dstDir); err != nil {
 			// dstDir is missing (or cannot be inspected): try to move the
 			// folder there, and only switch paths if the move worked.
 			retarget = os.Rename(t.ArtifactsDir, dstDir) == nil
 		}
 		if retarget {
 			t.ArtifactsDir = dstDir
 		}
 	}
 	ensureTaskReportPaths(t)
 }
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -2,6 +2,7 @@ package webui
 import (
 	"context"
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -12,6 +13,7 @@ import (
 	"time"
 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 )
 func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
@@ -161,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
 	}
 }
 // TestNewJobIDUsesTASKPrefixAndZeroPadding resets the global queue and job
 // counter, then checks that two consecutive newJobID calls produce TASK-000
 // and TASK-001. The previous queue contents and counter are restored on
 // cleanup.
 func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
 	globalQueue.mu.Lock()
 	savedTasks := globalQueue.tasks
 	globalQueue.tasks = nil
 	globalQueue.mu.Unlock()
 	savedCounter := jobCounter.Load()
 	jobCounter.Store(0)
 	t.Cleanup(func() {
 		globalQueue.mu.Lock()
 		globalQueue.tasks = savedTasks
 		globalQueue.mu.Unlock()
 		jobCounter.Store(savedCounter)
 	})
 	first := newJobID("ignored")
 	if first != "TASK-000" {
 		t.Fatalf("id=%q want TASK-000", first)
 	}
 	second := newJobID("ignored")
 	if second != "TASK-001" {
 		t.Fatalf("id=%q want TASK-001", second)
 	}
 }
 // TestTaskArtifactsDirStartsWithTaskNumber checks that the artifacts folder
 // for task TASK-007 is named with the bare number "007_" as its prefix.
 func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
 	task := &Task{ID: "TASK-007", Name: "NVIDIA Benchmark"}
 	dirName := filepath.Base(taskArtifactsDir(t.TempDir(), task, TaskDone))
 	if strings.HasPrefix(dirName, "007_") {
 		return
 	}
 	t.Fatalf("artifacts dir=%q want prefix 007_", dirName)
 }
 func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
 	dir := t.TempDir()
 	logPath := filepath.Join(dir, "task.log")
@@ -248,15 +284,205 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
 	t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
 }
 // TestFinalizeTaskRunCreatesReportFolderAndArtifacts runs finalizeTaskRun on a
 // running task whose job finished without an error and checks that the task
 // ends up done, that its artifacts folder name contains "_done", and that
 // report.json / report.html exist, with the JSON report carrying the task ID,
 // the done status and at least one chart.
 func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
 	dir := t.TempDir()
 	// Redirect the report's metrics DB to a temp file; restored on cleanup.
 	metricsPath := filepath.Join(dir, "metrics.db")
 	prevMetricsPath := taskReportMetricsDBPath
 	taskReportMetricsDBPath = metricsPath
 	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
 	db, err := openMetricsDB(metricsPath)
 	if err != nil {
 		t.Fatalf("openMetricsDB: %v", err)
 	}
 	// Seed one sample 45s in the past; the task below started 90s ago, so the
 	// sample lies after StartedAt. Presumably this is what yields the charts
 	// asserted at the end — confirm against the report builder.
 	base := time.Now().UTC().Add(-45 * time.Second)
 	if err := db.Write(platform.LiveMetricSample{
 		Timestamp:  base,
 		CPULoadPct: 42,
 		MemLoadPct: 35,
 		PowerW:     510,
 	}); err != nil {
 		t.Fatalf("Write: %v", err)
 	}
 	_ = db.Close()
 	// Stand-alone queue backed by the temp dir, so global state is untouched.
 	q := &taskQueue{
 		statePath: filepath.Join(dir, "tasks-state.json"),
 		logsDir:   filepath.Join(dir, "tasks"),
 		trigger:   make(chan struct{}, 1),
 	}
 	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	started := time.Now().UTC().Add(-90 * time.Second)
 	task := &Task{
 		ID:        "task-1",
 		Name:      "CPU SAT",
 		Target:    "cpu",
 		Status:    TaskRunning,
 		CreatedAt: started.Add(-10 * time.Second),
 		StartedAt: &started,
 	}
 	// Assign the log path while the task is still running, write one log
 	// line, then finish the job with an empty error string.
 	q.assignTaskLogPathLocked(task)
 	appendJobLog(task.LogPath, "line-1")
 	job := newTaskJobState(task.LogPath)
 	job.finish("")
 	q.finalizeTaskRun(task, job)
 	if task.Status != TaskDone {
 		t.Fatalf("status=%q want %q", task.Status, TaskDone)
 	}
 	if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
 		t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
 	}
 	if _, err := os.Stat(task.ReportJSONPath); err != nil {
 		t.Fatalf("report json: %v", err)
 	}
 	if _, err := os.Stat(task.ReportHTMLPath); err != nil {
 		t.Fatalf("report html: %v", err)
 	}
 	// Decode report.json and verify its identity, status and chart content.
 	var report taskReport
 	data, err := os.ReadFile(task.ReportJSONPath)
 	if err != nil {
 		t.Fatalf("ReadFile(report.json): %v", err)
 	}
 	if err := json.Unmarshal(data, &report); err != nil {
 		t.Fatalf("Unmarshal(report.json): %v", err)
 	}
 	if report.ID != task.ID || report.Status != TaskDone {
 		t.Fatalf("report=%+v", report)
 	}
 	if len(report.Charts) == 0 {
 		t.Fatalf("expected charts in report, got none")
 	}
 }
 // TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask writes a
 // benchmark result.json next to the archive path mentioned in the task log,
 // calls writeTaskReportArtifacts for a finished nvidia-benchmark task, and
 // checks that the generated report.html contains the benchmark section, the
 // GPU label and the composite score.
 func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	dir := t.TempDir()
 	// Redirect the report's metrics DB path to the temp dir; restored on
 	// cleanup. No DB file is created at that path in this test.
 	metricsPath := filepath.Join(dir, "metrics.db")
 	prevMetricsPath := taskReportMetricsDBPath
 	taskReportMetricsDBPath = metricsPath
 	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
 	// Benchmark output folder whose name matches the archive referenced in
 	// the log text below (minus the .tar.gz extension).
 	benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
 	if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	result := platform.NvidiaBenchmarkResult{
 		GeneratedAt:      time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
 		BenchmarkProfile: "standard",
 		OverallStatus:    "OK",
 		GPUs: []platform.BenchmarkGPUResult{
 			{
 				Index: 0,
 				Name:  "NVIDIA H100 PCIe",
 				Scores: platform.BenchmarkScorecard{
 					CompositeScore: 1176.25,
 				},
 			},
 		},
 	}
 	raw, err := json.Marshal(result)
 	if err != nil {
 		t.Fatal(err)
 	}
 	if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
 		t.Fatal(err)
 	}
 	// Pre-create the task's artifacts folder so the log file can be written
 	// before the report is generated.
 	artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
 	if err := os.MkdirAll(artifactsDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	task := &Task{
 		ID:           "task-bench",
 		Name:         "NVIDIA Benchmark",
 		Target:       "nvidia-benchmark",
 		Status:       TaskDone,
 		CreatedAt:    time.Now().UTC().Add(-time.Minute),
 		ArtifactsDir: artifactsDir,
 	}
 	ensureTaskReportPaths(task)
 	// NOTE(review): the "Archive: <path>" log line is presumably how the
 	// report code locates result.json for this task — confirm against
 	// writeTaskReportArtifacts.
 	logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
 	if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
 		t.Fatal(err)
 	}
 	if err := writeTaskReportArtifacts(task); err != nil {
 		t.Fatalf("writeTaskReportArtifacts: %v", err)
 	}
 	body, err := os.ReadFile(task.ReportHTMLPath)
 	if err != nil {
 		t.Fatalf("ReadFile(report.html): %v", err)
 	}
 	html := string(body)
 	// Every needle must appear verbatim in the rendered HTML.
 	for _, needle := range []string{
 		`Benchmark Results`,
 		`Composite score for this benchmark task.`,
 		`NVIDIA H100 PCIe / GPU 0`,
 		`1176.25`,
 	} {
 		if !strings.Contains(html, needle) {
 			t.Fatalf("report missing %q: %s", needle, html)
 		}
 	}
 }
 // TestTaskLifecycleMirrorsToSerialConsole replaces taskSerialWriteLine with a
 // recorder, drives a task through enqueue, job output and finalizeTaskRun, and
 // checks that the recorded lines mention the queued state, both job log lines
 // and the final "finished with status=done" message.
 func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
 	// Capture everything that would be written to the serial console; the
 	// original writer is restored on cleanup.
 	var lines []string
 	prev := taskSerialWriteLine
 	taskSerialWriteLine = func(line string) { lines = append(lines, line) }
 	t.Cleanup(func() { taskSerialWriteLine = prev })
 	dir := t.TempDir()
 	q := &taskQueue{
 		statePath: filepath.Join(dir, "tasks-state.json"),
 		logsDir:   filepath.Join(dir, "tasks"),
 		trigger:   make(chan struct{}, 1),
 	}
 	task := &Task{
 		ID:        "task-serial-1",
 		Name:      "CPU SAT",
 		Target:    "cpu",
 		Status:    TaskPending,
 		CreatedAt: time.Now().UTC(),
 	}
 	q.enqueue(task)
 	// Flip the task to running by hand instead of going through the worker.
 	started := time.Now().UTC()
 	task.Status = TaskRunning
 	task.StartedAt = &started
 	// The job state is created with the task's serial prefix; the assertions
 	// below expect its appended lines to reach taskSerialWriteLine.
 	job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
 	job.append("Starting CPU SAT...")
 	job.append("CPU stress duration: 60s")
 	job.finish("")
 	q.finalizeTaskRun(task, job)
 	joined := strings.Join(lines, "\n")
 	for _, needle := range []string{
 		"queued",
 		"Starting CPU SAT...",
 		"CPU stress duration: 60s",
 		"finished with status=done",
 	} {
 		if !strings.Contains(joined, needle) {
 			t.Fatalf("serial mirror missing %q in %q", needle, joined)
 		}
 	}
 }
 func TestResolveBurnPreset(t *testing.T) {
 	tests := []struct {
 		profile string
 		want    burnPreset
 	}{
-		{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
+		{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
-		{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
+		{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
-		{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
+		{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
-		{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
+		{profile: "", want: burnPreset{DurationSec: 5 * 60}},
 	}
 	for _, tc := range tests {
 		if got := resolveBurnPreset(tc.profile); got != tc.want {
@@ -467,3 +693,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
 		t.Fatalf("unexpected error: %q", j.err)
 	}
 }
 // TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow makes the CPU
 // acceptance runner panic and checks that executeTask marks the task failed
 // with a done timestamp, records "task panic: boom" on both the task and the
 // job, and leaves the kmsg watcher with no active count and no open window.
 func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
 	dir := t.TempDir()
 	q := &taskQueue{
 		opts:        &HandlerOptions{App: &app.App{}},
 		statePath:   filepath.Join(dir, "tasks-state.json"),
 		logsDir:     filepath.Join(dir, "tasks"),
 		kmsgWatcher: newKmsgWatcher(nil),
 	}
 	tk := &Task{
 		ID:        "cpu-panic-1",
 		Name:      "CPU SAT",
 		Target:    "cpu",
 		Status:    TaskRunning,
 		CreatedAt: time.Now(),
 	}
 	j := &jobState{}
 	// Swap the CPU runner for one that panics; the original is put back when
 	// this test function returns.
 	orig := runCPUAcceptancePackCtx
 	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
 		panic("boom")
 	}
 	defer func() { runCPUAcceptancePackCtx = orig }()
 	// Reaching the assertions at all means executeTask did not let the panic
 	// propagate to the caller.
 	q.executeTask(tk, j, context.Background())
 	if tk.Status != TaskFailed {
 		t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
 	}
 	if tk.DoneAt == nil {
 		t.Fatal("expected done_at to be set")
 	}
 	if !strings.Contains(tk.ErrMsg, "task panic: boom") {
 		t.Fatalf("task error=%q", tk.ErrMsg)
 	}
 	if !strings.Contains(j.err, "task panic: boom") {
 		t.Fatalf("job error=%q", j.err)
 	}
 	// Read the watcher state under its own mutex before asserting on it.
 	q.kmsgWatcher.mu.Lock()
 	activeCount := q.kmsgWatcher.activeCount
 	window := q.kmsgWatcher.window
 	q.kmsgWatcher.mu.Unlock()
 	if activeCount != 0 {
 		t.Fatalf("activeCount=%d want 0", activeCount)
 	}
 	if window != nil {
 		t.Fatalf("expected kmsg window to be cleared, got %+v", window)
 	}
 }
--- a/audit/scripts/resolve-version.sh
+++ b/audit/scripts/resolve-version.sh
@@ -0,0 +1,16 @@
 #!/bin/sh
 # Print the version derived from the nearest "v<digit>..." git tag.
 # Prints "dev" when git describe yields nothing (no matching tag, or the
 # command fails); otherwise prints the describe output without its leading "v".
 set -eu
 described="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
 if [ -z "${described}" ]; then
 	printf 'dev\n'
 	exit 0
 fi
 # ${described#v} drops a single leading "v" and leaves any other value as-is.
 printf '%s\n' "${described#v}"
--- a/bible-local/docs/benchmark-clock-calibration.md
+++ b/bible-local/docs/benchmark-clock-calibration.md
@@ -0,0 +1,248 @@
 # Benchmark clock calibration research
 ## Status
 In progress. Baseline data from production servers pending.
 ## Background
 The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
 before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
 `avg_steady_clock < locked_target * 0.90`.
 Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
 even a healthy GPU in a non-ideal server will sustain clocks well below boost.
 The 90% threshold has no empirical basis.
 ## Key observations (2026-04-06)
 ### H100 PCIe — new card, server not designed for it
 - avg clock 1384 MHz, P95 1560 MHz (unstable; boost is probably 1755 MHz)
 - Thermal sustain: 0.0 (sw_thermal covers entire steady window)
 - Stability: 70.0 — clocks erratic, no equilibrium found
 - Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
 ### H200 NVL — new card, server not designed for it
 - avg clock = P95 = 1635 MHz (perfectly stable)
 - Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
 - Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
 - Degradation: power_capped, thermal_limited
 - Compute: 989 TOPS — card is computing correctly for its frequency
 ### Key insight
 The meaningful distinction is not *whether* the card throttles but *how stably*
 it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
 H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
 instability may reflect a more severe thermal mismatch or a card issue.
 `sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
 `hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
 ## Hypothesis for baseline
 After testing on servers designed for their GPUs (proper cooling):
 - Healthy GPU under sustained load will run at a stable fraction of boost
 - Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
 - Base clock (`clocks.base.gr`) may be a better reference than boost:
  a healthy card under real workload should comfortably exceed base clock
 ## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
 Source: external stress test tool, ~90s runs, designed server, adequate power.
 ### Healthy fingerprint
 - **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
 - **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
  - Avg steady (visual): **~1580–1620 MHz**
  - vs boost 1755 MHz: **~91–92%**
  - Oscillation is NORMAL — this is the boost algorithm balancing under power cap
  - Stable power + oscillating clocks = healthy power-cap behavior
 - **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
 - **Consistency**: all 10 samples within ±20 MHz — very repeatable
 ### Characteristic pattern
 Flat power line + oscillating/declining clock line = GPU correctly managed by
 power cap algorithm. Do NOT flag this as instability.
 ### Clock CV implication
 The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
 The current `variance_too_high` threshold (StabilityScore < 85) may fire on
 healthy HBM2e PCIe cards. Needs recalibration.
 ---
 ## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
 Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
 Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
 ### GPU clock reference (from nvidia-smi, idle):
 - base_clock_mhz: **1095**
 - boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
 - achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
 - Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
 ### Observed under 700W sustained load (both samples nearly identical):
 - Power: ~700W flat — SXM slot, adequate power confirmed
 - Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
 - vs 1980 MHz (lock target): **72–74%** — severely below
 - vs 1755 MHz (nvidia-smi boost): **81–83%**
 - vs 1095 MHz (base): 130% — above base but far below expected for SXM
 - Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
 - Temperature: 38°C → 79–80°C (same rate as HBM2e)
 - Oscillation: present, similar character to HBM2e but at much lower frequency
 ### Diagnosis
 These restored cards are degraded. A healthy H100 SXM in a designed server
 (DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
 The 72–74% result is a clear signal of silicon or VRM degradation from the
 refurbishment process.
 ### Clock pattern note
 Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
 to images 19/20. Both sample sets show same degraded pattern — same batch.
 ---
 ## Baseline matrix (filled where data available)
 | GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
 |---|---|---|---|---|---|
 | H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
 | H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
 | H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
 | H200 NVL | designed | TBD | TBD | TBD | need baseline |
 ---
 ## H100 official spec (from NVIDIA datasheet)
 Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
 All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
 | Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
 |---|---|---|---|---|---|
 | H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
 | H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
 | H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
 | H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
 Notes:
 - SXM boards do NOT list FP8 peak in this table (field empty)
 - fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
 - Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
 ## Observed efficiency (H100 80GB PCIe, throttled server)
 From the report in this session (power+thermal throttle throughout steady):
 | Precision | Measured | Spec (dense) | % of spec |
 |---|---|---|---|
 | fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
 | fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
 | fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
 33–44% of spec is expected given sustained power+thermal throttle (avg clock
 1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
 actual frequency — the low TOPS comes from throttle, not silicon defect.
 ## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
 Format: without sparsity / with sparsity.
 | Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
 |---|---|---|---|---|---|
 | H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
 | H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
 ## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
 Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
 | Precision | Measured | Spec (dense) | % of spec |
 |---|---|---|---|
 | fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
 | fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
 | fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
 Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
 both are throttle-limited. Confirms that % of spec is not a quality signal,
 it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
 ## Real-world GEMM efficiency reference (2026-04-06, web research)
 Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
 worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
 ### What healthy systems actually achieve:
 - H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
 - cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
 - H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
 ### Our results vs expectation:
 | GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
 |---|---|---|---|---|
 | H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
 | H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
 Our results are roughly **half** of what a healthy system achieves even under throttle.
 This is NOT normal — 30-44% is not the industry baseline.
 ### Likely causes of the gap (in order of probability):
 1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
 2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
   Previous user may have set a lower limit via nvidia-smi -pl and it was not
   reset. Our normalization sets clock locks but does NOT reset power limit.
   Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
 3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
   8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
 ### Power limit gap analysis (H100 PCIe):
 - Avg clock 1384 MHz = 79% of boost 1755 MHz
 - Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
 - Actually measured: 329 TOPS = 55% of that estimate
 - Remaining gap after accounting for clock throttle: ~45%
 - Most likely explanation: enforced power limit < 350W TDP, further reducing
  sustainable clock beyond what sw_thermal alone would cause.
 ### Action item:
 Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
 so result.json shows if the card was pre-configured with a non-default limit.
 If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
 ### CPU/RAM impact on GPU FLOPS:
 None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
 CPU core count and host RAM are irrelevant.
 ## Compute efficiency metric (proposed, no hardcode)
 Instead of comparing TOPS to a hardcoded spec, compute:
  tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
 This is model-agnostic. A GPU computing correctly at its actual frequency
 will show a consistent tops_per_sm_per_ghz regardless of throttle level.
 A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
 normal clocks.
 SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
 (needs to be added to queryBenchmarkGPUInfo).
 Reference values to establish after baseline runs:
 - H100 PCIe fp16_tensor: TBD tops/SM/GHz
 - H100 SXM fp16_tensor: TBD tops/SM/GHz
 ## Proposed threshold changes (pending more data)
 1. **`low_sm_clock_vs_target`**: raise threshold from 90% to 85% based on observed
   91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
   capture the root cause.
 2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
   under power cap. Consider suppressing this flag when power is flat and usage
   is 100% (oscillation is expected). Or lower threshold to 70.
 3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
   ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
   would have been caught by this).
 Decision deferred until a baseline on SXM designed servers has been collected.
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -32,7 +32,7 @@ lb config noauto \
    --memtest memtest86+ \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
-    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -606,6 +606,20 @@ struct prepared_profile {
 };
 static const struct profile_desc k_profiles[] = {
    {
        "fp64",
        "fp64",
        80,
        1,
        0,
        0,
        8,
        CUDA_R_64F,
        CUDA_R_64F,
        CUDA_R_64F,
        CUDA_R_64F,
        CUBLAS_COMPUTE_64F,
    },
    {
        "fp32_tf32",
        "fp32",
--- a/iso/builder/build-in-container.sh
+++ b/iso/builder/build-in-container.sh
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
            ;;
        *)
            echo "unknown arg: $1" >&2
-            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
+            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
            exit 1
            ;;
    esac
 done
 case "$VARIANT" in
-    nvidia|amd|nogpu|all) ;;
+    nvidia|nvidia-legacy|amd|nogpu|all) ;;
-    *) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
+    *) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
 esac
 if [ "$CLEAN_CACHE" = "1" ]; then
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
           "${CACHE_DIR:?}/lb-packages"
    echo "=== cleaning live-build work dirs ==="
    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
    rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
    rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
    rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
    rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
    rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
    rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
    echo "=== caches cleared, proceeding with build ==="
 fi
@@ -180,6 +185,9 @@ case "$VARIANT" in
    nvidia)
        run_variant nvidia
        ;;
    nvidia-legacy)
        run_variant nvidia-legacy
        ;;
    amd)
        run_variant amd
        ;;
@@ -188,6 +196,7 @@ case "$VARIANT" in
        ;;
    all)
        run_variant nvidia
        run_variant nvidia-legacy
        run_variant amd
        run_variant nogpu
        ;;
--- a/iso/builder/build-nvidia-module.sh
+++ b/iso/builder/build-nvidia-module.sh
@@ -1,8 +1,10 @@
 #!/bin/sh
-# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
+# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
 #
 # Downloads the official NVIDIA .run installer, extracts kernel modules and
-# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
+# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
 #   - open         -> kernel-open/ sources from the .run installer
 #   - proprietary  -> traditional proprietary kernel sources from the .run installer
 #
 # Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
 # are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
@@ -17,10 +19,19 @@ set -e
 NVIDIA_VERSION="$1"
 DIST_DIR="$2"
 DEBIAN_KERNEL_ABI="$3"
 NVIDIA_FLAVOR="${4:-open}"
-[ -n "$NVIDIA_VERSION" ]    || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
+[ -n "$NVIDIA_VERSION" ]    || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
-[ -n "$DIST_DIR" ]          || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
+[ -n "$DIST_DIR" ]          || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
-[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
+[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
 case "$NVIDIA_FLAVOR" in
    open|proprietary) ;;
    *)
        echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
        exit 1
        ;;
 esac
 KVER="${DEBIAN_KERNEL_ABI}-amd64"
 # On Debian, kernel headers are split into two packages:
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
 KDIR_ARCH="/usr/src/linux-headers-${KVER}"
 KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
-echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
+echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
-if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
+CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
    echo "=== installing linux-headers-${KVER} ==="
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        "linux-headers-${KVER}" \
        gcc make perl
 fi
 echo "kernel headers (arch):   $KDIR_ARCH"
 echo "kernel headers (common): $KDIR_COMMON"
 CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
 EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
-CACHE_LAYOUT_VERSION="2"
+CACHE_LAYOUT_VERSION="3"
 CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
 if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
        && [ -f "$CACHE_LAYOUT_MARKER" ] \
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
    exit 0
 fi
 if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
    echo "=== installing linux-headers-${KVER} ==="
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        "linux-headers-${KVER}" \
        gcc make perl
 fi
 echo "kernel headers (arch):   $KDIR_ARCH"
 echo "kernel headers (common): $KDIR_COMMON"
 # Download official NVIDIA .run installer with sha256 verification
 BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
 mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
 rm -rf "$EXTRACT_DIR"
 "$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
-# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
+# Find kernel source directory for the selected flavor.
 KERNEL_SRC=""
-for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
+if [ "$NVIDIA_FLAVOR" = "open" ]; then
-    [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
+    for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
-done
+        [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
-[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
+    done
 else
    for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
        [ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
    done
 fi
 [ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
 echo "kernel source: $KERNEL_SRC"
 # Build kernel modules
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -15,26 +15,46 @@ DIST_DIR="${REPO_ROOT}/dist"
 VENDOR_DIR="${REPO_ROOT}/iso/vendor"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 AUTH_KEYS=""
 BUILD_VARIANT="nvidia"
 BEE_GPU_VENDOR="nvidia"
 BEE_NVIDIA_MODULE_FLAVOR="open"
 # parse args
 while [ $# -gt 0 ]; do
    case "$1" in
        --authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
-        --variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
+        --variant) BUILD_VARIANT="$2"; shift 2 ;;
        *) echo "unknown arg: $1"; exit 1 ;;
    esac
 done
-case "$BEE_GPU_VENDOR" in
+case "$BUILD_VARIANT" in
-    nvidia|amd|nogpu) ;;
+    nvidia)
-    *) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
+        BEE_GPU_VENDOR="nvidia"
        BEE_NVIDIA_MODULE_FLAVOR="open"
        ;;
    nvidia-legacy)
        BEE_GPU_VENDOR="nvidia"
        BEE_NVIDIA_MODULE_FLAVOR="proprietary"
        ;;
    amd)
        BEE_GPU_VENDOR="amd"
        BEE_NVIDIA_MODULE_FLAVOR=""
        ;;
    nogpu)
        BEE_GPU_VENDOR="nogpu"
        BEE_NVIDIA_MODULE_FLAVOR=""
        ;;
    *)
        echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
        exit 1
        ;;
 esac
-BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
+BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
-OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
+OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
-export BEE_GPU_VENDOR
+export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
 . "${BUILDER_DIR}/VERSIONS"
 export PATH="$PATH:/usr/local/go/bin"
@@ -54,15 +74,8 @@ resolve_audit_version() {
        return 0
    fi
-    tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
+    tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
    if [ -z "${tag}" ]; then
        tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
    fi
    case "${tag}" in
        audit/v*)
            echo "${tag#audit/v}"
            return 0
            ;;
        v*)
            echo "${tag#v}"
            return 0
@@ -309,6 +322,12 @@ memtest_fail() {
    return 0
 }
 # Report a fatal NVIDIA runtime validation problem on stderr and stop the
 # script with exit status 1.
 #   $1 - description of what went wrong
 nvidia_runtime_fail() {
    msg=$1
    echo "ERROR: ${msg}" 1>&2
    exit 1
 }
 iso_memtest_present() {
    iso_path="$1"
    iso_files="$(mktemp)"
@@ -446,6 +465,44 @@ validate_iso_memtest() {
    echo "=== memtest validation OK ==="
 }
 # Check that the finished ISO's squashfs lists the DCGM tools (dcgmi,
 # nv-hostengine, dcgmproftester). Returns 0 immediately for non-NVIDIA
 # builds; every failure goes through nvidia_runtime_fail after the two
 # temporary files have been removed.
 #   $1 - path to the ISO image to inspect
 validate_iso_nvidia_runtime() {
    iso_path="$1"
    if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
        return 0
    fi
    echo "=== validating NVIDIA runtime in ISO ==="
    # Preconditions: the image exists and the tools needed to look inside it.
    if [ ! -f "$iso_path" ]; then
        nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
    fi
    if ! require_iso_reader "$iso_path" >/dev/null 2>&1; then
        nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
    fi
    if ! command -v unsquashfs >/dev/null 2>&1; then
        nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
    fi
    squashfs_tmp="$(mktemp)"
    squashfs_list="$(mktemp)"
    # Pull the root filesystem image out of the ISO and list its contents.
    if ! iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp"; then
        rm -f "$squashfs_tmp" "$squashfs_list"
        nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
    fi
    if ! unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null; then
        rm -f "$squashfs_tmp" "$squashfs_list"
        nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
    fi
    # Each required binary must appear at the end of a listing line.
    if ! grep -Eq 'usr/bin/dcgmi$' "$squashfs_list"; then
        rm -f "$squashfs_tmp" "$squashfs_list"
        nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
    fi
    if ! grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list"; then
        rm -f "$squashfs_tmp" "$squashfs_list"
        nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
    fi
    if ! grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list"; then
        rm -f "$squashfs_tmp" "$squashfs_list"
        nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
    fi
    rm -f "$squashfs_tmp" "$squashfs_list"
    echo "=== NVIDIA runtime validation OK ==="
 }
 append_memtest_grub_entry() {
    grub_cfg="$1"
    [ -f "$grub_cfg" ] || return 1
@@ -590,7 +647,7 @@ recover_iso_memtest() {
 AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
 ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
-ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
+ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
 # Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
 OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
 mkdir -p "${OUT_DIR}"
@@ -764,7 +821,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
    apt-get install -y "linux-headers-${KVER}"
 fi
-echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
+echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
 echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
 echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
 echo ""
@@ -834,7 +891,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    fi
 fi
-echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
+echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
 mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
 # Sync builder config into variant work dir, preserving lb cache.
@@ -860,6 +917,86 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
    rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
 fi
 if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
    cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
 source /boot/grub/config.cfg
 echo ""
 echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
 echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
 echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
 echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
 echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
 echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
 echo "  Hardware Audit LiveCD"
 echo ""
 menuentry "EASY-BEE" {
    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }
 submenu "EASY-BEE (advanced options) -->" {
    menuentry "EASY-BEE — KMS (no nomodeset)" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
    }
    menuentry "EASY-BEE — fail-safe" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
        initrd  @INITRD_LIVE@
    }
 }
 if [ "${grub_platform}" = "efi" ]; then
    menuentry "Memory Test (memtest86+)" {
        chainloader /boot/memtest86+x64.efi
    }
 else
    menuentry "Memory Test (memtest86+)" {
        linux16 /boot/memtest86+x64.bin
    }
 fi
 if [ "${grub_platform}" = "efi" ]; then
    menuentry "UEFI Firmware Settings" {
        fwsetup
    }
 fi
 EOF
    cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
 label live-@FLAVOUR@-normal
    menu label ^EASY-BEE
    menu default
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@
 label live-@FLAVOUR@-kms
    menu label EASY-BEE (^graphics/KMS)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ bee.display=kms
 label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ toram
 label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
 label memtest
    menu label ^Memory Test (memtest86+)
    linux /boot/memtest86+x64.bin
 EOF
 fi
 rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
 rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -944,10 +1081,10 @@ done
 # --- NVIDIA kernel modules and userspace libs ---
 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
-        sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
+        sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
    KVER="${DEBIAN_KERNEL_ABI}-amd64"
-    NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
+    NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
    # Inject .ko files into overlay at /usr/local/lib/nvidia/
    OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
@@ -1018,13 +1155,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u
 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
 NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
 NCCL_VERSION=${NCCL_VERSION}
 NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
 CUBLAS_VERSION=${CUBLAS_VERSION}
 CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
 NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
 JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
-    GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
+    GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
 elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
    GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
    GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
@@ -1036,6 +1174,7 @@ fi
 cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
 BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
 BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
 BEE_BUILD_VARIANT=${BUILD_VARIANT}
 BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
 BUILD_DATE=${BUILD_DATE}
 GIT_COMMIT=${GIT_COMMIT}
@@ -1046,6 +1185,11 @@ EOF
 # Write GPU vendor marker for hooks
 echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
 else
    rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
 fi
 # Patch motd with build info
 BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
@@ -1116,10 +1260,10 @@ fi
 # --- build ISO using live-build ---
 echo ""
-echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
+echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
 # Export for auto/config
-BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
+BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
 export BEE_GPU_VENDOR_UPPER
 cd "${LB_DIR}"
@@ -1151,9 +1295,10 @@ if [ -f "$ISO_RAW" ]; then
        fi
    fi
    validate_iso_memtest "$ISO_RAW"
    validate_iso_nvidia_runtime "$ISO_RAW"
    cp "$ISO_RAW" "$ISO_OUT"
    echo ""
-    echo "=== done (${BEE_GPU_VENDOR}) ==="
+    echo "=== done (${BUILD_VARIANT}) ==="
    echo "ISO: $ISO_OUT"
    if command -v stat >/dev/null 2>&1; then
        ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -7,6 +7,7 @@ echo "  █████╗  ███████║███████╗ ╚
 echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
 echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
 echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
 echo "  Hardware Audit LiveCD"
 echo ""
 menuentry "EASY-BEE" {
@@ -14,29 +15,21 @@ menuentry "EASY-BEE" {
    initrd  @INITRD_LIVE@
 }
-menuentry "EASY-BEE (graphics/KMS)" {
+submenu "EASY-BEE (advanced options) -->" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    menuentry "EASY-BEE — GSP=off" {
-    initrd  @INITRD_LIVE@
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-}
+        initrd  @INITRD_LIVE@
    }
-menuentry "EASY-BEE (load to RAM)" {
+    menuentry "EASY-BEE — KMS (no nomodeset)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
+        initrd  @INITRD_LIVE@
-}
+    }
-menuentry "EASY-BEE (NVIDIA GSP=off)" {
+    menuentry "EASY-BEE — fail-safe" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-    initrd  @INITRD_LIVE@
+        initrd  @INITRD_LIVE@
-}
+    }
 menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }
 menuentry "EASY-BEE (fail-safe)" {
    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
    initrd  @INITRD_LIVE@
 }
 if [ "${grub_platform}" = "efi" ]; then
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -30,6 +30,8 @@ systemctl enable bee-preflight.service
 systemctl enable bee-audit.service
 systemctl enable bee-web.service
 systemctl enable bee-sshsetup.service
 systemctl enable bee-selfheal.timer
 systemctl enable bee-boot-status.service
 systemctl enable ssh.service
 systemctl enable lightdm.service 2>/dev/null || true
 systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -58,6 +60,8 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
 chmod +x /usr/local/bin/bee-selfheal      2>/dev/null || true
 chmod +x /usr/local/bin/bee-boot-status  2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -0,0 +1,117 @@
 #!/bin/sh
 # 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
 set -e
 echo "=== generating bee wallpaper ==="
 mkdir -p /usr/share/bee
 python3 - <<'PYEOF'
 from PIL import Image, ImageDraw, ImageFont, ImageFilter
 import os
 W, H = 1920, 1080
 GLYPHS = {
    'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"],
    'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"],
    'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"],
    'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"],
    'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"],
    '-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"],
 }
 TITLE = "EASY-BEE"
 SUBTITLE = "Hardware Audit LiveCD"
 CELL = 30
 GLYPH_GAP = 18
 ROW_GAP = 6
 FG = (0xF6, 0xD0, 0x47)
 FG_DIM = (0xD4, 0xA9, 0x1C)
 SHADOW = (0x5E, 0x47, 0x05)
 SUB = (0x96, 0x7A, 0x17)
 BG = (0x05, 0x05, 0x05)
 SUB_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
 ]
 def load_font(size):
    """Return a font of the given size for the subtitle text.

    Walks SUB_FONT_CANDIDATES in order and returns the first file that both
    exists and loads. Falls back to Pillow's built-in default font when no
    candidate is usable.
    """
    for path in SUB_FONT_CANDIDATES:
        if not os.path.exists(path):
            continue
        try:
            return ImageFont.truetype(path, size)
        except OSError:
            # Present but unreadable/corrupt font file: try the next
            # candidate rather than failing the whole hook (it runs under
            # `set -e`) over a cosmetic asset.
            continue
    return ImageFont.load_default()
 def glyph_width(ch):
    """Return the column count of the bitmap glyph for ``ch`` (length of its first row)."""
    first_row = GLYPHS[ch][0]
    return len(first_row)
 def render_logo_mask():
    """Rasterise TITLE from the GLYPHS bitmaps into an 8-bit ('L') mask image.

    Every '1' cell becomes a rounded square slightly smaller than CELL;
    neighbouring glyphs are separated by one blank cell plus GLYPH_GAP pixels,
    and rows are spaced CELL + ROW_GAP pixels apart.
    """
    last = len(TITLE) - 1
    # Glyph columns plus one spacer column between each pair of glyphs.
    total_cells = sum(glyph_width(letter) for letter in TITLE) + max(last, 0)
    size = (total_cells * CELL + last * GLYPH_GAP, 7 * CELL + 6 * ROW_GAP)
    mask = Image.new('L', size, 0)
    pen = ImageDraw.Draw(mask)
    row_pitch = CELL + ROW_GAP
    left = 0
    for pos, letter in enumerate(TITLE):
        for r, bits in enumerate(GLYPHS[letter]):
            for c, bit in enumerate(bits):
                if bit == '1':
                    x = left + c * CELL
                    y = r * row_pitch
                    pen.rounded_rectangle((x, y, x + CELL - 4, y + CELL - 4), radius=4, fill=255)
        # Advance past this glyph, and past the spacer unless it was the last.
        left += glyph_width(letter) * CELL
        if pos != last:
            left += CELL + GLYPH_GAP
    return mask
 # Start from a solid background canvas of W x H pixels.
 img = Image.new('RGB', (W, H), BG)
 draw = ImageDraw.Draw(img)
 # Soft amber glow under the logo without depending on font rendering.
 # Two translucent ellipses are drawn on a transparent layer, blurred, and
 # composited over the background; `img` is RGBA from here on.
 glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
 glow_draw = ImageDraw.Draw(glow)
 glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
 glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
 glow = glow.filter(ImageFilter.GaussianBlur(60))
 img = Image.alpha_composite(img.convert('RGBA'), glow)
 # Centre the block-letter logo horizontally at a fixed vertical offset.
 logo_mask = render_logo_mask()
 logo_w, logo_h = logo_mask.size
 logo_x = (W - logo_w) // 2
 logo_y = 290
 # Paste three copies through the mask, back to front: a blurred shadow,
 # a dimmer offset layer, then the bright face at the nominal position.
 shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
 img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
 img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
 img.paste(FG, (logo_x, logo_y), logo_mask)
 # Subtitle: centred below the logo. `draw` still wraps the original RGB
 # canvas at this point; it is used here only to measure the text.
 font_sub = load_font(30)
 sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
 sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
 sub_y = logo_y + logo_h + 54
 # Rebind `draw` to the composited image before actually drawing on it.
 draw = ImageDraw.Draw(img)
 # Dark copy offset by 2px acts as a drop shadow for the subtitle.
 draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
 draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
 # Convert back to RGB (dropping the alpha channel) and write the PNG.
 img = img.convert('RGB')
 img.save('/usr/share/bee/wallpaper.png', optimize=True)
 print('wallpaper written: /usr/share/bee/wallpaper.png')
 PYEOF
 echo "=== wallpaper done ==="
--- a/iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
+++ b/iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
@@ -0,0 +1,41 @@
 #!/bin/sh
 # 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
 #
 # live-boot tries "losetup --replace --direct-io=on" when re-associating the
 # loop device to the RAM copy in /dev/shm.  tmpfs does not support O_DIRECT,
 # so the ioctl returns EINVAL and the verification step fails.
 #
 # This hook edits the live-boot script in place with sed:
 #   1. the --direct-io option is removed from "losetup --replace" calls
 #      (there is no retry/fallback; direct I/O is simply no longer requested);
 #   2. the "Task finished with error." message is replaced by a warning text.
 # If the target script is absent the hook exits successfully without changes.
 set -e
 TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
 if [ ! -f "${TORAM_SCRIPT}" ]; then
    echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
    exit 0
 fi
 echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
 # 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
 #    The second expression catches the bare "--direct-io" spelling.
 #    NOTE(review): it matches only the option name, so a remaining
 #    "--direct-io=<value>" other than "=on" would become "--replace=<value>";
 #    confirm live-boot never emits such a form.
 sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
 sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"
 # 2. Replace the error message text with a warning.
 #    Only the echoed string is rewritten here; whether boot then continues
 #    depends on what the script does after printing it.
 #    NOTE(review): verify against the live-boot source.
 sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
 echo "9010-fix-toram: patch applied"
 # Show the resulting losetup lines in the build log; never fail the hook on this.
 grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
--- a/iso/builder/config/package-lists/bee-nvidia.list.chroot
+++ b/iso/builder/config/package-lists/bee-nvidia.list.chroot
@@ -1,6 +1,10 @@
-# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
+# NVIDIA DCGM (Data Center GPU Manager).
-# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
+# Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
-# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
+# NVIDIA max-compute recipe. The smoketest/runtime contract treats
 # dcgmproftester as required in the LiveCD.
 # DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
 # CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
 # explicitly.
 datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -60,9 +60,15 @@ qrencode
 # Local desktop (openbox + chromium kiosk)
 openbox
 tint2
 feh
 python3-pil
 xorg
 xterm
 chromium
 mousepad
 pcmanfm
 ristretto
 mupdf
 xserver-xorg-video-fbdev
 xserver-xorg-video-vesa
 lightdm
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -27,6 +27,7 @@ echo ""
 KVER=$(uname -r)
 info "kernel: $KVER"
 NVIDIA_BOOT_MODE="normal"
 NVIDIA_MODULES_FLAVOR="proprietary"
 for arg in $(cat /proc/cmdline 2>/dev/null); do
    case "$arg" in
        bee.nvidia.mode=*)
@@ -34,7 +35,11 @@ for arg in $(cat /proc/cmdline 2>/dev/null); do
            ;;
    esac
 done
 if [ -f /etc/bee-nvidia-modules-flavor ]; then
    NVIDIA_MODULES_FLAVOR="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null || echo proprietary)"
 fi
 info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
 info "nvidia modules flavor: ${NVIDIA_MODULES_FLAVOR}"
 # --- PATH & binaries ---
 echo "-- PATH & binaries --"
@@ -52,6 +57,31 @@ else
    fail "nvidia-smi: NOT FOUND"
 fi
 if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
    ok "dcgmi found: $p"
 else
    fail "dcgmi: NOT FOUND"
 fi
 if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
    ok "nv-hostengine found: $p"
 else
    fail "nv-hostengine: NOT FOUND"
 fi
 DCGM_PROFTESTER=""
 for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
        DCGM_PROFTESTER="$p"
        break
    fi
 done
 if [ -n "$DCGM_PROFTESTER" ]; then
    ok "dcgmproftester found: $DCGM_PROFTESTER"
 else
    fail "dcgmproftester: NOT FOUND"
 fi
 for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
        ok "$tool found: $p"
@@ -60,6 +90,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
    fi
 done
 if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
    ok "nvbandwidth found: $p"
 else
    warn "nvbandwidth: NOT FOUND"
 fi
 echo ""
 echo "-- NVIDIA modules --"
 KO_DIR="/usr/local/lib/nvidia"
@@ -79,10 +115,12 @@ fi
 for mod in nvidia_modeset nvidia_uvm; do
    if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
        ok "module loaded: $mod"
-    elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
+    elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
        fail "module NOT loaded in normal mode: $mod"
-    else
+    elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
        warn "module not loaded in GSP-off mode: $mod"
    else
        fail "module NOT loaded: $mod"
    fi
 done
@@ -98,10 +136,12 @@ done
 if [ -e /dev/nvidia-uvm ]; then
    ok "/dev/nvidia-uvm exists"
-elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
+elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
    fail "/dev/nvidia-uvm missing in normal mode"
-else
+elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
    warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
 else
    fail "/dev/nvidia-uvm missing"
 fi
 echo ""
@@ -171,6 +211,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
    fi
 done
 if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
    ok "timer active: bee-selfheal.timer"
 else
    fail "timer NOT active: bee-selfheal.timer"
 fi
 echo ""
 echo "-- runtime health --"
 if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
--- a/iso/overlay/etc/systemd/system/bee-audit.service
+++ b/iso/overlay/etc/systemd/system/bee-audit.service
@@ -1,7 +1,6 @@
 [Unit]
 Description=Bee: hardware audit
 After=bee-preflight.service bee-network.service bee-nvidia.service
 Before=bee-web.service
 [Service]
 Type=oneshot
--- a/iso/overlay/etc/systemd/system/bee-boot-status.service
+++ b/iso/overlay/etc/systemd/system/bee-boot-status.service
@@ -0,0 +1,18 @@
 [Unit]
 Description=Bee: boot status display
 After=systemd-user-sessions.service
 Before=getty@tty1.service
 [Service]
 Type=oneshot
 RemainAfterExit=no
 ExecStart=/usr/local/bin/bee-boot-status
 TTYPath=/dev/tty1
 StandardInput=tty
 StandardOutput=tty
 StandardError=tty
 TTYReset=yes
 TTYVHangup=yes
 [Install]
 WantedBy=multi-user.target
--- a/iso/overlay/etc/systemd/system/bee-selfheal.service
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.service
@@ -0,0 +1,9 @@
 [Unit]
 Description=Bee: periodic runtime self-heal
 After=bee-web.service bee-audit.service bee-preflight.service
 [Service]
 Type=oneshot
 ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
 StandardOutput=journal
 StandardError=journal
--- a/iso/overlay/etc/systemd/system/bee-selfheal.timer
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.timer
@@ -0,0 +1,11 @@
 [Unit]
 Description=Bee: run self-heal checks periodically
 [Timer]
 OnBootSec=45sec
 OnUnitActiveSec=60sec
 AccuracySec=15sec
 Unit=bee-selfheal.service
 [Install]
 WantedBy=timers.target
--- a/iso/overlay/etc/systemd/system/bee-web.service
+++ b/iso/overlay/etc/systemd/system/bee-web.service
@@ -1,12 +1,12 @@
 [Unit]
 Description=Bee: hardware audit web viewer
-After=bee-audit.service
+StartLimitIntervalSec=0
 [Service]
 Type=simple
 ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
 Restart=always
-RestartSec=2
+RestartSec=3
 StandardOutput=journal
 StandardError=journal
 LimitMEMLOCK=infinity
--- a/iso/overlay/etc/systemd/system/getty@tty1.service.d/wait-bee.conf
+++ b/iso/overlay/etc/systemd/system/getty@tty1.service.d/wait-bee.conf
@@ -0,0 +1,2 @@
 [Unit]
 After=bee-boot-status.service
--- a/iso/overlay/etc/systemd/system/lightdm.service.d/bee-display-mode.conf
+++ b/iso/overlay/etc/systemd/system/lightdm.service.d/bee-display-mode.conf
@@ -1,6 +1,4 @@
 [Unit]
 Wants=bee-preflight.service
 After=bee-preflight.service
 [Service]
 ExecStartPre=/usr/local/bin/bee-display-mode
--- a/iso/overlay/usr/local/bin/bee-boot-status
+++ b/iso/overlay/usr/local/bin/bee-boot-status
@@ -0,0 +1,89 @@
 #!/bin/sh
 # bee-boot-status — boot progress display on tty1.
 # Shows live service status until all bee services are done or failed,
 # then exits so getty can show the login prompt.
 CRITICAL="bee-preflight bee-nvidia bee-audit"
 ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
 # Print the systemd active-state of "$1.service" as exactly one word.
 #
 # `systemctl is-active` prints the state AND exits non-zero for every state
 # other than "active". Appending a fallback with `|| echo inactive` therefore
 # produced two lines (e.g. "failed" + "inactive"), which matched none of the
 # callers' case patterns. The fallback is now used only when systemctl
 # printed nothing at all.
 #   $1 - unit name without the ".service" suffix
 svc_state() {
    local state
    state="$(systemctl is-active "$1.service" 2>/dev/null)" || true
    printf '%s\n' "${state:-inactive}"
 }
 # Print a coloured, fixed-width status badge for service "$1" (no newline).
 # The badge text and ANSI colour code are chosen from the state reported by
 # svc_state; unrecognised states get a grey "[  ?   ]".
 svc_icon() {
    local colour badge
    case "$(svc_state "$1")" in
        active)       colour=32; badge='[  OK  ]' ;;
        failed)       colour=31; badge='[ FAIL ]' ;;
        activating)   colour=33; badge='[  ..  ]' ;;
        deactivating) colour=33; badge='[ stop ]' ;;
        inactive)     colour=90; badge='[      ]' ;;
        *)            colour=90; badge='[  ?   ]' ;;
    esac
    printf '\033[%sm%s\033[0m' "$colour" "$badge"
 }
 # Print an optional detail suffix for service "$1" (no trailing newline).
 #   failed     -> the unit's systemd Result value in red, unless it is empty
 #                 or "success"
 #   activating -> the most recent journal line of the unit, truncated to
 #                 55 characters, in grey
 # Any other state prints nothing.
 svc_detail() {
    local svc="$1" state
    state="$(svc_state "$svc")"
    case "$state" in
        failed)
            local res
            # "Result=<value>" -> keep only the part after "=".
            res="$(systemctl show -p Result "$svc.service" 2>/dev/null | cut -d= -f2)"
            [ -n "$res" ] && [ "$res" != "success" ] && printf '  \033[31m(%s)\033[0m' "$res"
            ;;
        activating)
            local line
            # Last journal message only, cut to 55 characters.
            line="$(journalctl -u "$svc.service" -n 1 --no-pager --output=cat 2>/dev/null | cut -c1-55)"
            [ -n "$line" ] && printf '  \033[90m%s\033[0m' "$line"
            ;;
    esac
 }
 # Succeed (return 0) when every service listed in $CRITICAL reports one of
 # the states active, failed or inactive; return 1 as soon as one reports
 # anything else.
 all_critical_done() {
    local unit_state
    for svc in $CRITICAL; do
        unit_state="$(svc_state "$svc")"
        if [ "$unit_state" != "active" ] && [ "$unit_state" != "failed" ] && [ "$unit_state" != "inactive" ]; then
            return 1
        fi
    done
    return 0
 }
 while true; do
    # move to top-left and clear screen
    printf '\033[H\033[2J'
    printf '\n'
    printf '  \033[33m███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗\033[0m\n'
    printf '  \033[33m██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝\033[0m\n'
    printf '  \033[33m█████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗\033[0m\n'
    printf '  \033[33m██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝\033[0m\n'
    printf '  \033[33m███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗\033[0m\n'
    printf '  \033[33m╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝\033[0m\n'
    printf '  Hardware Audit LiveCD\n'
    printf '\n'
    for svc in $ALL; do
        printf '  %s  %-20s%s\n' "$(svc_icon "$svc")" "$svc" "$(svc_detail "$svc")"
    done
    printf '\n'
    # Network
    ips="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{printf "  %-16s %s\n", $NF, $2}')"
    if [ -n "$ips" ]; then
        printf '  \033[1mNetwork:\033[0m\n'
        printf '%s\n' "$ips"
        printf '\n'
    fi
    if all_critical_done; then
        printf '  \033[1;32mSystem ready.\033[0m  Audit is running in the background.\n'
        first_ip="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{print $2}' | cut -d/ -f1 | head -1)"
        if [ -n "$first_ip" ]; then
            printf '  Web UI: \033[1mhttp://%s/\033[0m\n' "$first_ip"
        fi
        printf '\n'
        sleep 3
        break
    fi
    printf '  \033[90mStarting up...\033[0m\n'
    sleep 3
 done
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -62,6 +62,8 @@ done
 echo "loader=bee-gpu-burn"
 echo "selected_gpus=${FINAL}"
 export CUDA_DEVICE_ORDER="PCI_BUS_ID"
 TMP_DIR=$(mktemp -d)
 trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
@@ -78,7 +80,8 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
        fi
    fi
    echo "starting gpu ${id} size=${gpu_size_mb}MB"
-    "${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
+    CUDA_VISIBLE_DEVICES="${id}" \
        "${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
 done
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -1,10 +1,11 @@
 #!/bin/sh
 set -eu
-SECONDS=300
+DURATION_SEC=300
 DEVICES=""
 EXCLUDE=""
 FORMAT=""
 TEST_SLICE_SECONDS=300
 JOHN_DIR="/usr/local/lib/bee/john/run"
 JOHN_BIN="${JOHN_DIR}/john"
 export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
@@ -116,7 +117,7 @@ ensure_opencl_ready() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
-        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -151,14 +152,19 @@ done
 [ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
 export CUDA_DEVICE_ORDER="PCI_BUS_ID"
 export CUDA_VISIBLE_DEVICES="${FINAL}"
 JOHN_DEVICES=""
 local_id=1
 for id in $(echo "${FINAL}" | tr ',' ' '); do
-    opencl_id=$((id + 1))
+    opencl_id="${local_id}"
    if [ -z "${JOHN_DEVICES}" ]; then
        JOHN_DEVICES="${opencl_id}"
    else
        JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
    fi
    local_id=$((local_id + 1))
 done
 echo "loader=john"
@@ -189,14 +195,51 @@ CHOSEN_FORMAT=$(choose_format) || {
    exit 1
 }
-echo "format=${CHOSEN_FORMAT}"
+run_john_loop() {
    # Run `john --test` on one OpenCL device in consecutive slices until the
    # absolute deadline passes.
    #   $1  OpenCL device id handed to --devices
    #   $2  deadline as seconds since the epoch
    # Each slice lasts at most TEST_SLICE_SECONDS; the final one is shortened to
    # the time that is left. Returns 1 as soon as one john invocation fails.
    opencl_id="$1"
    deadline="$2"
    round=0
    while :; do
        now=$(date +%s)
        remaining=$((deadline - now))
        # Stop once the deadline has been reached or passed.
        [ "${remaining}" -gt 0 ] || break
        round=$((round + 1))
        # Clamp the slice to whichever is smaller: time left or the slice cap.
        if [ "${remaining}" -gt "${TEST_SLICE_SECONDS}" ]; then
            slice="${TEST_SLICE_SECONDS}"
        else
            slice="${remaining}"
        fi
        echo "device=${opencl_id} round=${round} remaining_sec=${remaining} slice_sec=${slice}"
        if ! ./john --test="${slice}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}"; then
            return 1
        fi
    done
 }
 PIDS=""
 # cleanup is the EXIT/INT/TERM trap handler: it stops every background worker
 # recorded in PIDS and then exits with the status that was current when the
 # handler started.
 cleanup() {
    # Must be the first statement so $? still holds the pre-handler status.
    rc=$?
    # Drop the traps so the exit below does not re-enter this handler.
    trap - EXIT INT TERM
    # Signal all workers first, then reap them; failures (already-exited
    # workers) are ignored on purpose.
    for pid in ${PIDS}; do
        kill "${pid}" 2>/dev/null || true
    done
    for pid in ${PIDS}; do
        wait "${pid}" 2>/dev/null || true
    done
    exit "${rc}"
 }
 trap cleanup EXIT INT TERM
 echo "format=${CHOSEN_FORMAT}"
 echo "target_seconds=${DURATION_SEC}"
 echo "slice_seconds=${TEST_SLICE_SECONDS}"
 DEADLINE=$(( $(date +%s) + DURATION_SEC ))
 _first=1
 for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
    [ "${_first}" = "1" ] || sleep 3
    _first=0
-    ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
+    run_john_loop "${opencl_id}" "${DEADLINE}" &
-    PIDS="${PIDS} $!"
+    pid=$!
    PIDS="${PIDS} ${pid}"
 done
 FAIL=0
 for pid in ${PIDS}; do
--- a/iso/overlay/usr/local/bin/bee-nccl-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-nccl-gpu-stress
@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
 echo "range=${MIN_BYTES}..${MAX_BYTES}"
 echo "iters=${ITERS}"
 export CUDA_DEVICE_ORDER="PCI_BUS_ID"
 deadline=$(( $(date +%s) + SECONDS ))
 round=0
--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -6,6 +6,19 @@ NVIDIA_KO_DIR="/usr/local/lib/nvidia"
 # log prints its arguments on stdout with a "[bee-nvidia]" prefix.
 log() { echo "[bee-nvidia] $*"; }
 # read_nvidia_modules_flavor prints the kernel-module flavor recorded in
 # /etc/bee-nvidia-modules-flavor. Only "open" and "proprietary" are accepted;
 # a missing file or any other content yields "proprietary".
 read_nvidia_modules_flavor() {
    if [ ! -f /etc/bee-nvidia-modules-flavor ]; then
        echo "proprietary"
        return 0
    fi
    # Strip all whitespace so a trailing newline does not defeat the comparison.
    flavor="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null)"
    if [ "$flavor" = "open" ] || [ "$flavor" = "proprietary" ]; then
        echo "$flavor"
        return 0
    fi
    echo "proprietary"
 }
 log "kernel: $(uname -r)"
 # Skip if no NVIDIA GPU present (PCI vendor 10de)
@@ -40,6 +53,8 @@ if [ -z "$nvidia_mode" ]; then
    nvidia_mode="normal"
 fi
 log "boot mode: $nvidia_mode"
 nvidia_modules_flavor="$(read_nvidia_modules_flavor)"
 log "modules flavor: $nvidia_modules_flavor"
 load_module() {
    mod="$1"
@@ -50,11 +65,93 @@ load_module() {
        log "WARN: not found: $ko"
        return 1
    fi
-    if insmod "$ko" "$@"; then
+    if timeout 90 insmod "$ko" "$@"; then
        log "loaded: $mod $*"
        return 0
    fi
-    log "WARN: failed to load: $mod"
+    log "WARN: failed to load: $mod (exit $?)"
    dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
    return 1
 }
 # nvidia_is_functional succeeds when /proc/devices has a line ending in
 # " nvidiactl"; callers use this as the signal that the nvidia module has
 # finished initialising. A missing or unreadable /proc/devices counts as failure.
 nvidia_is_functional() {
    grep -q ' nvidiactl$' /proc/devices 2>/dev/null
 }
 # load_module_with_gsp_fallback loads nvidia.ko with GSP firmware enabled and
 # falls back to NVreg_EnableGpuFirmware=0 when initialisation does not finish
 # within 90 seconds.
 #
 # On success the resulting mode ("gsp-on" or "gsp-off") is written to
 # /run/bee-nvidia-mode; when the module cannot be unloaded for the retry,
 # "gsp-stuck" is written instead.
 #
 # Returns 0 once nvidiactl is present in /proc/devices, 1 on any failure.
 load_module_with_gsp_fallback() {
    ko="$NVIDIA_KO_DIR/nvidia.ko"
    if [ ! -f "$ko" ]; then
        log "ERROR: not found: $ko"
        return 1
    fi
    # Run insmod in background — on some converted SXM→PCIe cards GSP enters an
    # infinite crash/reload loop and insmod never returns. We check for successful
    # initialization by polling /proc/devices for nvidiactl instead of waiting for
    # insmod to exit.
    log "loading nvidia (GSP enabled, timeout 90s)"
    insmod "$ko" &
    _insmod_pid=$!
    _waited=0
    while [ $_waited -lt 90 ]; do
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP enabled, ${_waited}s)"
            echo "gsp-on" > /run/bee-nvidia-mode
            return 0
        fi
        # Check if insmod exited with an error before timeout
        if ! kill -0 "$_insmod_pid" 2>/dev/null; then
            # Capture the status through `||` so a non-zero exit from insmod
            # cannot abort the script under `set -e` before it is recorded.
            _rc=0
            wait "$_insmod_pid" || _rc=$?
            if [ $_rc -ne 0 ]; then
                log "nvidia load failed (exit $_rc)"
                dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
                return 1
            fi
            # insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
            sleep 2
            if nvidia_is_functional; then
                log "loaded: nvidia (GSP enabled, ${_waited}s)"
                # Record the mode here as well, so this success path reports
                # the same state as the one at the top of the polling loop.
                echo "gsp-on" > /run/bee-nvidia-mode
                return 0
            fi
            log "insmod exited 0 but nvidiactl missing — treating as failure"
            return 1
        fi
        sleep 1
        _waited=$((_waited + 1))
    done
    # GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
    log "nvidia GSP init timed out after 90s"
    kill "$_insmod_pid" 2>/dev/null || true
    # NOTE(review): if insmod is blocked uninterruptibly inside the kernel this
    # wait may not return until the module init call does — confirm on affected hardware.
    wait "$_insmod_pid" 2>/dev/null || true
    # Attempt to unload the partially-initialized module
    if ! rmmod nvidia 2>/dev/null; then
        # Module is stuck in the kernel — cannot reload with different params.
        # User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
        log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
        log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
        echo "gsp-stuck" > /run/bee-nvidia-mode
        return 1
    fi
    sleep 2
    log "retrying with NVreg_EnableGpuFirmware=0"
    log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"
    if insmod "$ko" NVreg_EnableGpuFirmware=0; then
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP disabled)"
            echo "gsp-off" > /run/bee-nvidia-mode
            return 0
        fi
        log "insmod gsp-off exited 0 but nvidiactl missing"
        return 1
    fi
    log "nvidia load failed (GSP=off)"
    dmesg | tail -n 10 | sed 's/^/  dmesg: /' || true
    return 1
 }
@@ -68,37 +165,54 @@ load_host_module() {
    return 1
 }
-case "$nvidia_mode" in
+if [ "$nvidia_modules_flavor" = "open" ]; then
-    normal|full)
+    case "$nvidia_mode" in
-        if ! load_module nvidia; then
+        gsp-off|safe|nomsi)
-            exit 1
+            log "ignoring boot mode ${nvidia_mode} for open NVIDIA modules"
-        fi
+            ;;
-        # nvidia-modeset on some server kernels needs ACPI video helper symbols
+    esac
-        # exported by the generic "video" module. Best-effort only; compute paths
+    if ! load_module nvidia; then
-        # remain functional even if display-related modules stay absent.
+        exit 1
-        load_host_module video || true
+    fi
-        load_module nvidia-modeset || true
+    # nvidia-modeset on some server kernels needs ACPI video helper symbols
-        load_module nvidia-uvm || true
+    # exported by the generic "video" module. Best-effort only; compute paths
-        ;;
+    # remain functional even if display-related modules stay absent.
-    gsp-off|safe)
+    load_host_module video || true
-        # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
+    load_module nvidia-modeset || true
-        # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
+    load_module nvidia-uvm || true
-        # conservative path for platforms where full boot-time GSP init is unstable.
+else
-        if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
+    case "$nvidia_mode" in
-            exit 1
+        normal|full)
-        fi
+            if ! load_module_with_gsp_fallback; then
-        log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
+                exit 1
-        ;;
+            fi
-    nomsi|*)
+            # nvidia-modeset on some server kernels needs ACPI video helper symbols
-        # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
+            # exported by the generic "video" module. Best-effort only; compute paths
-        # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
+            # remain functional even if display-related modules stay absent.
-        # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
+            load_host_module video || true
-        if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
+            load_module nvidia-modeset || true
-            exit 1
+            load_module nvidia-uvm || true
-        fi
+            ;;
-        log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
+        gsp-off|safe)
-        ;;
+            # NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
-esac
+            # be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
            # conservative path for platforms where full boot-time GSP init is unstable.
            if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
                exit 1
            fi
            log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
            ;;
        nomsi|*)
            # nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
            # "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
            # NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
            if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
                exit 1
            fi
            log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
            ;;
    esac
 fi
 # Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
 nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
@@ -127,14 +241,45 @@ fi
 ldconfig 2>/dev/null || true
 log "ldconfig refreshed"
 # Turn on NVIDIA persistence mode for the whole session so that dcgmi and the
 # stress tools do not raise deployment warnings on otherwise healthy GPUs.
 # Every outcome is only logged; none of them stops the script.
 if ! command -v nvidia-smi >/dev/null 2>&1; then
    log "WARN: nvidia-smi not found — cannot enable persistence mode"
 elif nvidia-smi -pm 1 >/dev/null 2>&1; then
    log "enabled NVIDIA persistence mode"
 else
    log "WARN: failed to enable NVIDIA persistence mode"
 fi
 # Start DCGM host engine so dcgmi can discover GPUs.
-# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
+# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
-# "group is empty" even when GPUs and modules are present.
+# If it started too early (for example via systemd before bee-nvidia-load), it can
-# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
+# keep a stale empty inventory and dcgmi diag later reports no testable entities.
 if command -v nv-hostengine >/dev/null 2>&1; then
    if pgrep -x nv-hostengine >/dev/null 2>&1; then
-        log "nv-hostengine already running — skipping"
+        if command -v pkill >/dev/null 2>&1; then
-    else
+            pkill -x nv-hostengine >/dev/null 2>&1 || true
            tries=0
            while pgrep -x nv-hostengine >/dev/null 2>&1; do
                tries=$((tries + 1))
                if [ "${tries}" -ge 10 ]; then
                    log "WARN: nv-hostengine is still running after restart request"
                    break
                fi
                sleep 1
            done
            if pgrep -x nv-hostengine >/dev/null 2>&1; then
                log "WARN: keeping existing nv-hostengine process"
            else
                log "nv-hostengine restarted"
            fi
        else
            log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
        fi
    fi
    if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
        nv-hostengine
        log "nv-hostengine started"
    fi
--- a/iso/overlay/usr/local/bin/bee-openbox-session
+++ b/iso/overlay/usr/local/bin/bee-openbox-session
@@ -7,16 +7,24 @@ xset s off
 xset -dpms
 xset s noblank
 # Set desktop background.
 if [ -f /usr/share/bee/wallpaper.png ]; then
    feh --bg-fill /usr/share/bee/wallpaper.png
 else
    xsetroot -solid '#f6c90e'
 fi
 tint2 &
-# Wait up to 120s for bee-web to bind. The web server starts immediately now
+# Wait up to 60s for bee-web before opening Chromium.
-# (audit is deferred), so this should succeed in a few seconds on most hardware.
+# Without this Chromium gets connection-refused and shows a blank page.
-i=0
+_i=0
-while [ $i -lt 120 ]; do
+while [ $_i -lt 60 ]; do
-    if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
+    curl -sf http://localhost/healthz >/dev/null 2>&1 && break
    sleep 1
-    i=$((i+1))
+    _i=$((_i+1))
 done
 unset _i
 chromium \
    --disable-infobars \
@@ -24,7 +32,8 @@ chromium \
    --no-first-run \
    --disable-session-crashed-bubble \
    --disable-features=TranslateUI \
    --user-data-dir=/tmp/bee-chrome \
    --start-maximized \
-    http://localhost/ &
+    http://localhost/loading &
 exec openbox
--- a/iso/overlay/usr/local/bin/bee-selfheal
+++ b/iso/overlay/usr/local/bin/bee-selfheal
@@ -0,0 +1,99 @@
 #!/bin/bash
 # bee-selfheal — periodic best-effort recovery for critical live ISO services.
 set -u
 LOG_PREFIX="bee-selfheal"
 EXPORT_DIR="/appdata/bee/export"
 AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
 RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
 LOCK_DIR="/run/bee-selfheal.lock"
 # log writes one line to stdout: the LOG_PREFIX in square brackets followed by
 # all arguments joined with spaces.
 log() {
    printf '[%s] %s\n' "${LOG_PREFIX}" "$*"
 }
 # have_nvidia_gpu succeeds when the numeric lspci listing contains "10de:"
 # (case-insensitive), i.e. a device with NVIDIA's PCI vendor id is present.
 # lspci errors are silenced, so a missing lspci reads as "no GPU".
 have_nvidia_gpu() {
    lspci -nn 2>/dev/null | grep -qi '10de:'
 }
 # service_active succeeds when systemd reports unit $1 as active.
 # Output and error messages from systemctl are suppressed.
 service_active() {
    systemctl is-active --quiet "$1" 2>/dev/null
 }
 # restart_service restarts systemd unit $1 and logs the outcome.
 # Returns 0 when systemctl reports success, 1 otherwise.
 restart_service() {
    local unit="$1"
    if ! systemctl restart "$unit" >/dev/null 2>&1; then
        log "WARN: failed to restart ${unit}"
        return 1
    fi
    log "restarted ${unit}"
    return 0
 }
 # file_ready succeeds when path $1 exists and is not empty.
 # NOTE(review): no call site is visible in the shown part of this script —
 # confirm whether this helper is still needed.
 file_ready() {
    [ -s "$1" ]
 }
 # artifact_state prints the state of the export artifact at path $1:
 #   ready        the file exists and is non-empty
 #   interrupted  the file is absent or empty but "<path>.tmp" exists
 #   missing      neither of the above
 artifact_state() {
    local target="$1"
    local state="missing"
    if [ -s "${target}" ]; then
        state="ready"
    elif [ -e "${target}.tmp" ]; then
        state="interrupted"
    fi
    echo "${state}"
 }
 # web_healthy probes the local web server: it opens a TCP connection to
 # 127.0.0.1:80 through bash's /dev/tcp, sends an HTTP/1.0 "GET /healthz" and
 # succeeds only when the response contains a line that is exactly "ok".
 #
 # The probe is bounded to 10 seconds with timeout(1). Without the bound, a
 # server that accepts the connection but never answers or closes it would keep
 # grep reading forever and stall the whole self-heal run while it holds the
 # lock. A timed-out probe returns non-zero and is treated as unhealthy.
 web_healthy() {
    timeout 10 bash -c 'exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3' \
        >/dev/null 2>&1
 }
 mkdir -p "${EXPORT_DIR}" /run
 # The lock is a directory because mkdir either creates it or fails in one
 # step; a second concurrent run sees the failure and backs off.
 if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
    log "another self-heal run is already active"
    exit 0
 fi
 # Release the lock on every exit path from here on.
 trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
 log "start"
 # 1. NVIDIA: hardware is present but the first device node is not.
 if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
    log "NVIDIA GPU detected but /dev/nvidia0 is missing"
    restart_service bee-nvidia.service || true
 fi
 # 2. Runtime health artifact: re-run preflight unless the file is ready.
 runtime_state="$(artifact_state "${RUNTIME_JSON}")"
 case "${runtime_state}" in
    ready)
        ;;
    interrupted)
        log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
        restart_service bee-preflight.service || true
        ;;
    *)
        log "runtime-health.json missing or empty"
        restart_service bee-preflight.service || true
        ;;
 esac
 # 3. Audit artifact: re-run the audit unless the file is ready.
 audit_state="$(artifact_state "${AUDIT_JSON}")"
 case "${audit_state}" in
    ready)
        ;;
    interrupted)
        log "bee-audit.json.tmp exists — interrupted audit write detected"
        restart_service bee-audit.service || true
        ;;
    *)
        log "bee-audit.json missing or empty"
        restart_service bee-audit.service || true
        ;;
 esac
 # 4. Web UI: restart when the unit is down, or up but failing its health probe.
 # The probe is only attempted when the unit is active.
 web_problem=""
 if ! service_active bee-web.service; then
    web_problem="bee-web.service is not active"
 elif ! web_healthy; then
    web_problem="bee-web health check failed"
 fi
 if [ -n "${web_problem}" ]; then
    log "${web_problem}"
    restart_service bee-web.service || true
 fi
 log "done"
Author	SHA1	Message	Date
Mikhail Chusavitin	93cfa78e8c	Benchmark: parallel GPU mode, resilient inventory query, server model in results - Add parallel GPU mode (checkbox, off by default): runs all selected GPUs simultaneously via a single bee-gpu-burn invocation instead of sequentially; per-GPU telemetry, throttle counters, TOPS, and scoring are preserved - Make queryBenchmarkGPUInfo resilient: falls back to a base field set when extended fields (attribute.multiprocessor_count, power.default_limit) cause exit status 2, preventing lgc normalization from being silently skipped - Log explicit "graphics clock lock skipped" note when inventory is unavailable - Collect server model from DMI (/sys/class/dmi/id/product_name) and store in result JSON; benchmark history columns now show "Server Model (N× GPU Model)" grouped by server+GPU type rather than individual GPU index Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-07 18:32:15 +03:00
Mikhail Chusavitin	1358485f2b	fix logo wallpaper	2026-04-07 10:15:38 +03:00
Michael Chus	8fe20ba678	Fix benchmark scoring: PowerSustain uses default power limit PowerSustainScore now uses DefaultPowerLimitW as reference so a manually reduced power limit does not inflate the score. Falls back to enforced limit if default is unavailable. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 22:30:59 +03:00
Michael Chus	d973231f37	Enhance benchmark: server power via IPMI, efficiency metrics, FP64, power limit check - Sample server power (IPMI dcmi) during baseline+steady phases in parallel; compute delta vs GPU-reported sum; flag ratio < 0.75 as unreliable reporting - Collect base_graphics_clock_mhz, multiprocessor_count, default_power_limit_w from nvidia-smi alongside existing GPU info - Add tops_per_sm_per_ghz efficiency metric (model-agnostic silicon quality signal) - Flag when enforced power limit is below default TDP by >5% - Add fp64 profile to bee-gpu-burn worker (CUDA_R_64F, CUBLAS_COMPUTE_64F, min cc 8.0) - Improve Executive Summary: overall pass count, FAILED GPU finding - Throttle counters now shown as % of steady window instead of raw microseconds - bible-local: clock calibration research, H100/H200 spec, real-world GEMM baselines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 22:26:52 +03:00
Michael Chus	f5d175f488	Fix toram: patch live-boot to not use O_DIRECT when replacing loop to tmpfs losetup --replace --direct-io=on fails with EINVAL when the target file is on tmpfs (/dev/shm), because tmpfs does not support O_DIRECT. Strip the --direct-io flag from the replace call and downgrade the verification failure to a warning so boot continues. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 21:06:21 +03:00
Michael Chus	fa00667750	Refactor NVIDIA GPU Selection into standalone card on validate page Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-06 21:06:16 +03:00
Mikhail Chusavitin	c7d2816a7f	Limit NVIDIA legacy boot hooks to proprietary ISO	2026-04-06 16:33:16 +03:00
Mikhail Chusavitin	d2eadedff2	Default NVIDIA ISO to open modules and add nvidia-legacy	2026-04-06 16:27:13 +03:00
Mikhail Chusavitin	a98c4d7461	Include terminal charts in benchmark report	2026-04-06 12:34:57 +03:00
Mikhail Chusavitin	2354ae367d	Normalize task IDs and artifact folder prefixes	2026-04-06 12:26:47 +03:00
Mikhail Chusavitin	0d0e1f55a7	Avoid misleading SAT summaries after task cancellation	2026-04-06 12:24:19 +03:00
Mikhail Chusavitin	35f4c53887	Stabilize NVIDIA GPU device mapping across loaders	2026-04-06 12:22:04 +03:00
Mikhail Chusavitin	981315e6fd	Split NVIDIA tasks by homogeneous GPU groups	2026-04-06 11:58:13 +03:00
Mikhail Chusavitin	fc5c100a29	Fix NVIDIA persistence mode and add benchmark results table	2026-04-06 10:47:07 +03:00
Michael Chus	6e94216f3b	Hide task charts while pending	2026-04-05 22:34:34 +03:00
Michael Chus	53455063b9	Stabilize live task detail page	2026-04-05 22:14:52 +03:00
Michael Chus	4602f97836	Enforce sequential task orchestration	2026-04-05 22:10:42 +03:00
Michael Chus	c65d3ae3b1	Add nomodeset to default GRUB entry — fix black screen on headless servers Servers with NVIDIA compute GPUs (H100 etc.) have no display output, so KMS blanks the console. nomodeset disables kernel modesetting and lets the NVIDIA proprietary driver handle display via Xorg. KMS variant moved to advanced submenu for cases where it is needed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 21:40:47 +03:00
Michael Chus	7a21c370e4	Handle NVIDIA GSP firmware init hang with timeout fallback - bee-nvidia-load: run insmod in background, poll /proc/devices for nvidiactl; if GSP init doesn't complete in 90s, kill insmod and retry with NVreg_EnableGpuFirmware=0. Handles EBUSY case with clear error. - Write /run/bee-nvidia-mode (gsp-on/gsp-off/gsp-stuck) for audit layer - Show GSP mode badge in sidebar: yellow for gsp-off, red for gsp-stuck - Report NvidiaGSPMode in RuntimeHealth with issue entries - Simplify GRUB menu: default (KMS+GSP), advanced submenu (GSP=off, nomodeset, fail-safe), remove load-to-RAM entry - Add pcmanfm, ristretto, mupdf, mousepad to desktop packages Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 21:00:43 +03:00
Michael Chus	a493e3ab5b	Fix service control buttons: sudo, real error output, UX feedback - services.go: use sudo systemctl so bee user can control system services - api.go: always return 200 with output field even on error, so the frontend shows the actual systemctl message instead of "exit status 1" - pages.go: button shows "..." while pending then restores label; output panel is full-width under the table with ✓/✗ status indicator; output auto-scrolls to bottom Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 20:25:41 +03:00
Michael Chus	19b4803ec7	Pass exact cycle duration to GPU stress instead of 86400s sentinel bee-gpu-burn now receives --seconds <LoadSec> so it exits naturally when the cycle ends, rather than relying solely on context cancellation to kill it. Process group kill (Setpgid+Cancel) is kept as a safety net for early cancellation (user stop, context timeout). Same fix for AMD RVS which now gets duration_ms = LoadSec * 1000. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 20:22:43 +03:00
Michael Chus	1bdfb1e9ca	Fix nvidia-targeted-stress failing with DCGM_ST_IN_USE (-34) nvvs (DCGM validation suite) survives when dcgmi is killed mid-run, leaving the GPU occupied. The next dcgmi diag invocation then fails with "affected resource is in use". Two-part fix: - Add nvvs and dcgmi to KillTestWorkers patterns so they are cleaned up by the global cancel handler - Call KillTestWorkers at the start of RunNvidiaTargetedStressValidatePack to clear any stale processes before dcgmi diag runs Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 20:21:36 +03:00
Michael Chus	c5d6b30177	Fix platform thermal cycling leaving GPU load running after test ends bee-gpu-burn is a shell script that spawns bee-gpu-burn-worker children. exec.CommandContext default cancel only kills the shell parent; the worker processes survive and keep loading the GPU indefinitely. Fix: set Setpgid=true and a custom Cancel that sends SIGKILL to the entire process group (-pid), same pattern already used in runSATCommandCtx. Applied to Nvidia, AMD, and CPU stress commands for consistency. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 20:19:20 +03:00
Michael Chus	5b9015451e	Add live task charts and fix USB export actions	2026-04-05 20:14:23 +03:00
Michael Chus	d1a6863ceb	Use amber fallback wallpaper color (#f6c90e) instead of black Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:30:41 +03:00
Michael Chus	f9aa05de8e	Add wallpaper: black background with amber EASY-BEE ASCII art logo - Add feh and python3-pil to package list - Add chroot hook that generates /usr/share/bee/wallpaper.png using PIL: black background, EASY-BEE box-drawing logo in amber (#f6c90e), "Hardware Audit LiveCD" subtitle in dim amber — matches motd exactly - bee-openbox-session: set wallpaper with feh --bg-fill, fall back to xsetroot -solid black if wallpaper not found Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:29:42 +03:00
Michael Chus	a9ccea8cca	Fix black desktop and Chromium blank page on startup - Set xsetroot solid background (#12100a, dark amber) so openbox doesn't show bare black before Chromium opens - Re-add healthz wait loop before launching Chromium: without it Chromium opens localhost/loading before bee-web is up and gets connection-refused which renders as a blank white page Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:25:32 +03:00
Michael Chus	fc5c985fb5	Reset tty1 properly when bee-boot-status exits Add TTYReset=yes and TTYVHangup=yes so systemd restores the terminal to a clean state before handing tty1 to getty. Without this the screen went black with no cursor after the status display finished. Also remove DefaultDependencies=no which was too aggressive. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:22:01 +03:00
Michael Chus	5eb3baddb4	Fix bee-boot-status blank screen caused by variable buffering Command substitution in sh strips trailing newlines, so accumulating output in a variable via $(...) lost all line breaks. Reverted to direct printf calls which work correctly. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:21:10 +03:00
Michael Chus	a6ac13b5d3	Improve bee-boot-status: slower refresh, more detail - Refresh every 3s instead of 1s to reduce flicker - Show ssh, bee-sshsetup in service list - Show failure reason for failed services - Show last journal line for activating services - Show IP addresses and web UI URL when network is up - Render frame to variable before printing to reduce flicker Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:20:07 +03:00
Michael Chus	4003cb7676	Lower kernel console loglevel to 3 to reduce boot noise loglevel=6 floods the screen with mpt3sas/scsi/sd informational messages, hiding systemd service status and bee-boot-status display. loglevel=3 shows only kernel errors; all messages still go to serial. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 19:19:09 +03:00
Michael Chus	2875313ba0	Improve boot UX: status display, faster GUI, loading spinner - Add bee-boot-status service: shows live service status on tty1 with ASCII logo before getty, exits when all bee services settle - Remove lightdm dependency on bee-preflight so GUI starts immediately without waiting for NVIDIA driver load - Replace Chromium blank-page problem with /loading spinner page that polls /api/services and auto-redirects when services are ready; add "Open app now" override button; use fresh --user-data-dir=/tmp/bee-chrome - Unify branding: add "Hardware Audit LiveCD" subtitle to GRUB menu, bee-boot-status (with yellow ASCII logo), and web spinner Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-05 18:58:24 +03:00
Michael Chus	f1621efee4	Mirror task lifecycle to serial console	2026-04-05 18:34:06 +03:00
Michael Chus	4461249cc3	Make memory stress size follow available RAM	2026-04-05 18:33:26 +03:00
Michael Chus	e609fbbc26	Add task reports and streamline GPU charts	2026-04-05 18:13:58 +03:00
Michael Chus	cc2b49ea41	Improve validate GPU runs and web UI feedback	2026-04-05 17:50:13 +03:00
Michael Chus	33e0a5bef2	Refine validate UI and runtime health table	2026-04-05 16:24:45 +03:00
Michael Chus	38e79143eb	Refine burn UI and NVIDIA stress flows	2026-04-05 13:43:43 +03:00
Michael Chus	25af2df23a	Unify metrics charts on custom SVG renderer	2026-04-05 12:17:50 +03:00
Michael Chus	20abff7f90	WIP: checkpoint current tree	2026-04-05 12:05:00 +03:00
Michael Chus	a14ec8631c	Persist GPU chart mode and expand GPU charts	2026-04-05 11:52:32 +03:00
Michael Chus	f58c7e58d3	Fix webui streaming recovery regressions	2026-04-05 10:39:09 +03:00
Michael Chus	bf47c8dbd2	Add NVIDIA benchmark reporting flow	2026-04-05 10:30:56 +03:00
Michael Chus	143b7dca5d	Add stability hardening and self-heal recovery	2026-04-05 10:29:37 +03:00
Michael Chus	9826d437a5	Add GPU clock charts and grouped GPU metrics view	2026-04-05 09:57:38 +03:00
Mikhail Chusavitin	f3c14cd893	Harden NIC probing for empty SFP ports	2026-04-04 15:23:15 +03:00
Mikhail Chusavitin	728270dc8e	Unblock bee-web startup and expand support bundle diagnostics	2026-04-04 15:18:43 +03:00
Mikhail Chusavitin	8692f825bc	Use plain repo tags for build version	2026-04-03 10:48:51 +03:00
Mikhail Chusavitin	11f52ac710	Fix task log modal scrolling	2026-04-03 10:36:11 +03:00
Mikhail Chusavitin	1cb398fe83	Show tag version at top of sidebar	2026-04-03 10:08:00 +03:00
Mikhail Chusavitin	7a843be6b0	Stabilize DCGM GPU discovery	2026-04-03 09:50:33 +03:00
Mikhail Chusavitin	7f6386dccc	Restore USB support bundle export on tools page	2026-04-03 09:48:22 +03:00
Mikhail Chusavitin	eea2591bcc	Fix John GPU stress duration semantics	2026-04-03 09:46:16 +03:00
Mikhail Chusavitin	295a19b93a	feat(tasks): run all queued tasks in parallel Tasks are now started simultaneously when multiple are enqueued (e.g. Run All). The worker drains all pending tasks at once and launches each in its own goroutine, waiting via WaitGroup. kmsg watcher updated to use a shared event window with a reference counter across concurrent tasks. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 09:15:06 +03:00
Mikhail Chusavitin	444a7d16cc	fix(iso): increase boot verbosity for service startup visibility Raise loglevel from 3 to 6 (INFO) and add systemd.show_status=1 so kernel driver messages and systemd [ OK ]/[ FAILED ] lines are visible during boot instead of showing only a blank cursor. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 19:33:27 +03:00
Mikhail Chusavitin	fd722692a4	feat(watchdog): hardware error monitor + unified component status store - Add platform/error_patterns.go: pluggable table of kernel log patterns (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct - Add app/component_status_db.go: persistent JSON store (component-status.json) keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never downgrades Warning or Critical - Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks, writes Warning to DB for matched hardware errors - Fix task status: overall_status=FAILED in summary.txt now marks task failed - Audit routine overlays component DB statuses into bee-audit.json on every read Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 19:20:59 +03:00