Compare commits
190 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 29179917c3 | |||
| be4b439804 | |||
| 749fc8a94d | |||
| 6112094d45 | |||
| e9a2bc9f9d | |||
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f | ||
|
|
679aeb9947 | ||
|
|
647e99b697 | ||
|
|
4af997f436 | ||
|
|
6caace0cc0 | ||
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 | ||
| c69bf07b27 | |||
| b3cf8e3893 | |||
| 17118298bd | |||
| 65bcc9ce81 | |||
| 0cdfbc5875 | |||
| cf9b54b600 | |||
| 0bfb3fe954 | |||
| 3053cb0710 | |||
| 2038489961 | |||
| e35484013e | |||
| 2cdf034bb0 | |||
| b89580c24d | |||
| df1385d3d6 | |||
| f8cd9a7376 | |||
| d52ec67f8f | |||
| 61c7abaa80 | |||
| d60f7758ba | |||
| 52c3a24b76 | |||
| 028bb30333 | |||
| 7d64e5d215 | |||
| 51b721aeb3 | |||
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 | |||
|
|
5285c0d101 | ||
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 | ||
|
|
e306250da7 | ||
|
|
c5b2081ac9 | ||
| 434528083e | |||
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 | |||
| f87461ee4a | |||
| a636146dbd | |||
|
|
303de2df04 | ||
|
|
95124d228f | ||
|
|
54338dbae5 | ||
|
|
2be7ae6d28 | ||
|
|
b1a5035edd | ||
|
|
8fc986c933 | ||
|
|
88b5e0edf2 | ||
|
|
82fe1f6d26 | ||
| 81e7c921f8 | |||
| 0fb8f2777f | |||
| bf182daa89 | |||
| 457ea1cf04 | |||
| bf6ecab4f0 | |||
| 02e44b1172 | |||
| 2ceaa0d0ca | |||
| 9482ba20a2 | |||
| 813e2f86a9 | |||
| 58a6da9b44 | |||
| f4a19c0a00 | |||
| 9e3dcf9b4d | |||
| 098e19f760 | |||
| e16d0f34b5 | |||
|
|
525ed8b8fc | ||
|
|
4f94ebcb2c | ||
|
|
05c1fde233 | ||
| 825ef6b98a | |||
| ba16021cdb | |||
|
|
bb1218ddd4 | ||
|
|
65faae8ede | ||
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b | ||
|
|
3f41a026ca | ||
|
|
0ee4f46537 | ||
| 8db40b098a | |||
| 16e7ae00e7 | |||
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d | |||
|
|
0a4bb596f6 | ||
|
|
531d1ca366 | ||
|
|
93cfa78e8c | ||
|
|
1358485f2b | ||
| 8fe20ba678 | |||
| d973231f37 | |||
| f5d175f488 | |||
| fa00667750 | |||
|
|
c7d2816a7f | ||
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 | |||
| 5b9015451e | |||
| d1a6863ceb | |||
| f9aa05de8e | |||
| a9ccea8cca | |||
| fc5c985fb5 | |||
| 5eb3baddb4 | |||
| a6ac13b5d3 | |||
| 4003cb7676 | |||
| 2875313ba0 | |||
| f1621efee4 | |||
| 4461249cc3 | |||
| e609fbbc26 | |||
| cc2b49ea41 | |||
| 33e0a5bef2 | |||
| 38e79143eb | |||
| 25af2df23a | |||
| 20abff7f90 | |||
| a14ec8631c | |||
| f58c7e58d3 | |||
| bf47c8dbd2 | |||
| 143b7dca5d | |||
| 9826d437a5 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,3 +2,5 @@
|
|||||||
.DS_Store
|
.DS_Store
|
||||||
dist/
|
dist/
|
||||||
iso/out/
|
iso/out/
|
||||||
|
build-cache/
|
||||||
|
audit/bee
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
LISTEN ?= :8080
|
LISTEN ?= :8080
|
||||||
AUDIT_PATH ?=
|
AUDIT_PATH ?=
|
||||||
|
EXPORT_DIR ?= $(CURDIR)/.tmp/export
|
||||||
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||||
GO_LDFLAGS := -X main.Version=$(VERSION)
|
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||||
|
|
||||||
RUN_ARGS := web --listen $(LISTEN)
|
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
|
||||||
ifneq ($(AUDIT_PATH),)
|
ifneq ($(AUDIT_PATH),)
|
||||||
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||||
endif
|
endif
|
||||||
@@ -11,6 +12,7 @@ endif
|
|||||||
.PHONY: run build test
|
.PHONY: run build test
|
||||||
|
|
||||||
run:
|
run:
|
||||||
|
mkdir -p $(EXPORT_DIR)
|
||||||
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||||
|
|
||||||
build:
|
build:
|
||||||
|
|||||||
@@ -2,11 +2,14 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
"runtime/debug"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -29,10 +32,19 @@ func main() {
|
|||||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||||
}
|
}
|
||||||
|
|
||||||
func run(args []string, stdout, stderr io.Writer) int {
|
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||||
Level: slog.LevelInfo,
|
Level: slog.LevelInfo,
|
||||||
})))
|
})))
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
slog.Error("fatal panic",
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
exitCode = 1
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
printRootUsage(stderr)
|
printRootUsage(stderr)
|
||||||
@@ -56,8 +68,14 @@ func run(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runSupportBundle(args[1:], stdout, stderr)
|
return runSupportBundle(args[1:], stdout, stderr)
|
||||||
case "web":
|
case "web":
|
||||||
return runWeb(args[1:], stdout, stderr)
|
return runWeb(args[1:], stdout, stderr)
|
||||||
|
case "blackbox":
|
||||||
|
return runBlackbox(args[1:], stdout, stderr)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT(args[1:], stdout, stderr)
|
return runSAT(args[1:], stdout, stderr)
|
||||||
|
case "benchmark":
|
||||||
|
return runBenchmark(args[1:], stdout, stderr)
|
||||||
|
case "bee-worker":
|
||||||
|
return runBeeWorker(args[1:], stdout, stderr)
|
||||||
case "version", "--version", "-version":
|
case "version", "--version", "-version":
|
||||||
fmt.Fprintln(stdout, Version)
|
fmt.Fprintln(stdout, Version)
|
||||||
return 0
|
return 0
|
||||||
@@ -74,8 +92,11 @@ func printRootUsage(w io.Writer) {
|
|||||||
bee preflight --output stdout|file:<path>
|
bee preflight --output stdout|file:<path>
|
||||||
bee export --target <device>
|
bee export --target <device>
|
||||||
bee support-bundle --output stdout|file:<path>
|
bee support-bundle --output stdout|file:<path>
|
||||||
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||||
|
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
|
||||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||||
|
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||||
|
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||||
bee version
|
bee version
|
||||||
bee help [command]`)
|
bee help [command]`)
|
||||||
}
|
}
|
||||||
@@ -92,8 +113,14 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||||
case "web":
|
case "web":
|
||||||
return runWeb([]string{"--help"}, stdout, stdout)
|
return runWeb([]string{"--help"}, stdout, stdout)
|
||||||
|
case "blackbox":
|
||||||
|
return runBlackbox([]string{"--help"}, stdout, stdout)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT([]string{"--help"}, stdout, stderr)
|
return runSAT([]string{"--help"}, stdout, stderr)
|
||||||
|
case "benchmark":
|
||||||
|
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||||
|
case "bee-worker":
|
||||||
|
return runBeeWorker([]string{"--help"}, stdout, stderr)
|
||||||
case "version":
|
case "version":
|
||||||
fmt.Fprintln(stdout, "usage: bee version")
|
fmt.Fprintln(stdout, "usage: bee version")
|
||||||
return 0
|
return 0
|
||||||
@@ -280,7 +307,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||||
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
|
||||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
title := fs.String("title", "Bee Hardware Audit", "page title")
|
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||||
fs.Usage = func() {
|
fs.Usage = func() {
|
||||||
@@ -319,6 +346,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBlackbox(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
|
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
|
||||||
|
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
|
||||||
|
slog.Error("run blackbox", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||||
@@ -366,9 +420,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||||
}
|
}
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||||
case "storage":
|
case "storage":
|
||||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
dur := *duration
|
dur := *duration
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
@@ -383,3 +437,107 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
slog.Info("sat archive written", "target", target, "path", archive)
|
slog.Info("sat archive written", "target", target, "path", archive)
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||||
|
if len(args) == 0 {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||||
|
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
target := args[0]
|
||||||
|
if target != "nvidia" {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
|
||||||
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
|
||||||
|
devices := fs.String("devices", "", "comma-separated GPU indices to include")
|
||||||
|
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
|
||||||
|
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
|
||||||
|
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
|
||||||
|
if err := fs.Parse(args[1:]); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
includeIndices, err := parseBenchmarkIndexCSV(*devices)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
application := app.New(platform.New())
|
||||||
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||||
|
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: *profile,
|
||||||
|
SizeMB: *sizeMB,
|
||||||
|
GPUIndices: includeIndices,
|
||||||
|
ExcludeGPUIndices: excludeIndices,
|
||||||
|
RunNCCL: !*skipNCCL,
|
||||||
|
}, logLine)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("run benchmark", "target", target, "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
slog.Info("benchmark archive written", "target", target, "path", archive)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
|
||||||
|
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for _, part := range strings.Split(raw, ",") {
|
||||||
|
part = strings.TrimSpace(part)
|
||||||
|
if part == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(part)
|
||||||
|
if err != nil || value < 0 {
|
||||||
|
return nil, fmt.Errorf("bad gpu index %q", part)
|
||||||
|
}
|
||||||
|
indices = append(indices, value)
|
||||||
|
}
|
||||||
|
return indices, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -5,22 +5,18 @@ go 1.25.0
|
|||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-analyze/charts v0.5.26
|
modernc.org/sqlite v1.48.0
|
||||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
golang.org/x/image v0.24.0 // indirect
|
|
||||||
golang.org/x/sys v0.42.0 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
modernc.org/libc v1.70.0 // indirect
|
modernc.org/libc v1.72.0 // indirect
|
||||||
modernc.org/mathutil v1.7.1 // indirect
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
modernc.org/memory v1.11.0 // indirect
|
modernc.org/memory v1.11.0 // indirect
|
||||||
modernc.org/sqlite v1.48.0 // indirect
|
|
||||||
)
|
)
|
||||||
|
|||||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
|||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
|
||||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
|
||||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||||
|
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||||
|
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||||
|
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||||
|
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||||
|
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||||
|
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||||
|
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||||
|
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||||
|
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||||
|
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||||
|
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||||
|
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||||
|
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||||
|
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||||
|
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||||
|
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||||
|
|||||||
@@ -19,17 +19,22 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
DefaultExportDir = "/appdata/bee/export"
|
DefaultExportDir = "/appdata/bee/export"
|
||||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||||
|
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
||||||
|
DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune"
|
||||||
|
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
||||||
|
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
||||||
|
DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
|
||||||
)
|
)
|
||||||
|
|
||||||
type App struct {
|
type App struct {
|
||||||
@@ -83,6 +88,7 @@ type installer interface {
|
|||||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||||
IsLiveMediaInRAM() bool
|
IsLiveMediaInRAM() bool
|
||||||
LiveBootSource() platform.LiveBootSource
|
LiveBootSource() platform.LiveBootSource
|
||||||
|
LiveMediaRAMState() platform.LiveMediaRAMState
|
||||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,6 +113,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
|
|||||||
return a.installer.LiveBootSource()
|
return a.installer.LiveBootSource()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
|
||||||
|
return a.installer.LiveMediaRAMState()
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||||
}
|
}
|
||||||
@@ -114,9 +124,19 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
|||||||
type satRunner interface {
|
type satRunner interface {
|
||||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
ResetNvidiaGPU(index int) (string, error)
|
||||||
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||||
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
@@ -129,7 +149,7 @@ type satRunner interface {
|
|||||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -181,6 +201,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
|
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
@@ -195,10 +216,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return path, nil
|
return path, nil
|
||||||
@@ -223,10 +241,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return path, nil
|
return path, nil
|
||||||
@@ -292,7 +307,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
|||||||
}
|
}
|
||||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||||
data, err := os.ReadFile(DefaultAuditJSONPath)
|
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -520,6 +535,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return a.sat.ListNvidiaGPUs()
|
return a.sat.ListNvidiaGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
return a.sat.ListNvidiaGPUStatuses()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||||
|
out, err := a.sat.ResetNvidiaGPU(index)
|
||||||
|
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -532,10 +556,106 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
|||||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPerfDir
|
||||||
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchAutotuneDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||||
|
}
|
||||||
|
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||||
|
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -544,14 +664,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -576,14 +696,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -670,8 +790,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
|||||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||||
body := "Results: " + path
|
body := "Results: " + path
|
||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
body += "\nERROR: " + err.Error()
|
body += "\nERROR: " + err.Error()
|
||||||
@@ -868,6 +995,41 @@ func bodyOr(body, fallback string) string {
|
|||||||
return body
|
return body
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||||
|
// component-status DB so they are visible in the Hardware Summary card.
|
||||||
|
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||||
|
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||||
|
if db == nil || len(psus) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const source = "audit:ipmi"
|
||||||
|
worstStatus := "OK"
|
||||||
|
for _, psu := range psus {
|
||||||
|
if psu.Status == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
slot := "?"
|
||||||
|
if psu.Slot != nil {
|
||||||
|
slot = *psu.Slot
|
||||||
|
}
|
||||||
|
st := *psu.Status
|
||||||
|
detail := ""
|
||||||
|
if psu.ErrorDescription != nil {
|
||||||
|
detail = *psu.ErrorDescription
|
||||||
|
}
|
||||||
|
db.Record("psu:"+slot, source, st, detail)
|
||||||
|
switch st {
|
||||||
|
case "Critical":
|
||||||
|
worstStatus = "Critical"
|
||||||
|
case "Warning":
|
||||||
|
if worstStatus != "Critical" {
|
||||||
|
worstStatus = "Warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
db.Record("psu:all", source, worstStatus, "")
|
||||||
|
}
|
||||||
|
|
||||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -886,6 +1048,12 @@ func latestSATSummaries() []string {
|
|||||||
prefix string
|
prefix string
|
||||||
}{
|
}{
|
||||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||||
|
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||||
|
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||||
|
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||||
|
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||||
|
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||||
|
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||||
{label: "Memory SAT", prefix: "memory-"},
|
{label: "Memory SAT", prefix: "memory-"},
|
||||||
{label: "Storage SAT", prefix: "storage-"},
|
{label: "Storage SAT", prefix: "storage-"},
|
||||||
{label: "CPU SAT", prefix: "cpu-"},
|
{label: "CPU SAT", prefix: "cpu-"},
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -120,15 +121,26 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||||
runCPUFn func(string, int) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
detectVendorFn func() string
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
runAMDPackFn func(string) (string, error)
|
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||||
|
runNCCLFn func(string, []int) (string, error)
|
||||||
|
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||||
|
runMemoryFn func(string) (string, error)
|
||||||
|
runStorageFn func(string) (string, error)
|
||||||
|
runCPUFn func(string, int) (string, error)
|
||||||
|
detectVendorFn func() string
|
||||||
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
|
runAMDPackFn func(string) (string, error)
|
||||||
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
|
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||||
|
resetNvidiaGPUFn func(int) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||||
@@ -139,6 +151,62 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaBenchmarkFn != nil {
|
||||||
|
return f.runNvidiaBenchmarkFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPowerBenchFn != nil {
|
||||||
|
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaAutotuneFn != nil {
|
||||||
|
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaComputeFn != nil {
|
||||||
|
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPowerFn != nil {
|
||||||
|
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPulseFn != nil {
|
||||||
|
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaBandwidthFn != nil {
|
||||||
|
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaStressFn != nil {
|
if f.runNvidiaStressFn != nil {
|
||||||
return f.runNvidiaStressFn(baseDir, opts)
|
return f.runNvidiaStressFn(baseDir, opts)
|
||||||
@@ -153,11 +221,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
if f.listNvidiaGPUStatusesFn != nil {
|
||||||
|
return f.listNvidiaGPUStatusesFn()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||||
|
if f.resetNvidiaGPUFn != nil {
|
||||||
|
return f.resetNvidiaGPUFn(index)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||||
return f.runMemoryFn(baseDir)
|
return f.runMemoryFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||||
return f.runStorageFn(baseDir)
|
return f.runStorageFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,10 +297,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNCCLFn != nil {
|
||||||
|
return f.runNCCLFn(baseDir, gpuIndices)
|
||||||
|
}
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var gotBaseDir string
|
||||||
|
var gotGPUIndices []int
|
||||||
|
a := &App{
|
||||||
|
sat: fakeSAT{
|
||||||
|
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||||
|
gotBaseDir = baseDir
|
||||||
|
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||||
|
return "/tmp/nccl-tests.tar.gz", nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunNCCLTests error: %v", err)
|
||||||
|
}
|
||||||
|
if path != "/tmp/nccl-tests.tar.gz" {
|
||||||
|
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||||
|
}
|
||||||
|
if gotBaseDir != "/tmp/sat" {
|
||||||
|
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||||
|
}
|
||||||
|
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||||
|
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -478,8 +593,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -516,8 +629,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -579,8 +690,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
oldSATBaseDir := DefaultSATBaseDir
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||||
@@ -709,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -736,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
tr := tar.NewReader(gzr)
|
tr := tar.NewReader(gzr)
|
||||||
var names []string
|
var names []string
|
||||||
var auditJSON string
|
var auditJSON string
|
||||||
|
var manifest string
|
||||||
for {
|
for {
|
||||||
hdr, err := tr.Next()
|
hdr, err := tr.Next()
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
@@ -752,11 +868,21 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
}
|
}
|
||||||
auditJSON = string(body)
|
auditJSON = string(body)
|
||||||
}
|
}
|
||||||
|
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||||
|
body, err := io.ReadAll(tr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read manifest entry: %v", err)
|
||||||
|
}
|
||||||
|
manifest = string(body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, want := range []string{
|
for _, want := range []string{
|
||||||
"/system/ip-link.txt",
|
"/system/ip-link.txt",
|
||||||
"/system/ip-link-stats.txt",
|
"/system/ip-link-stats.txt",
|
||||||
|
"/system/kernel-aer-nvidia.txt",
|
||||||
|
"/system/lspci-nvidia-bridges-vv.txt",
|
||||||
|
"/system/pcie-aer-sysfs.txt",
|
||||||
"/system/ethtool-info.txt",
|
"/system/ethtool-info.txt",
|
||||||
"/system/ethtool-link.txt",
|
"/system/ethtool-link.txt",
|
||||||
"/system/ethtool-module.txt",
|
"/system/ethtool-module.txt",
|
||||||
@@ -792,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||||
}
|
}
|
||||||
|
if !contains(manifest, "files:") {
|
||||||
|
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||||
|
}
|
||||||
|
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||||
|
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMainBanner(t *testing.T) {
|
func TestMainBanner(t *testing.T) {
|
||||||
|
|||||||
67
audit/internal/app/atomic_write.go
Normal file
67
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
)
|
||||||
|
|
||||||
|
// readFileLimited reads path into memory, refusing files larger than maxBytes.
|
||||||
|
// Prevents OOM on corrupted or unexpectedly large data files.
|
||||||
|
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
data, err := io.ReadAll(io.LimitReader(f, maxBytes+1))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if int64(len(data)) > maxBytes {
|
||||||
|
return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
tmpPath := path + ".tmp"
|
||||||
|
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("open temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
success := false
|
||||||
|
defer func() {
|
||||||
|
_ = f.Close()
|
||||||
|
if !success {
|
||||||
|
_ = os.Remove(tmpPath)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if _, err := f.Write(data); err != nil {
|
||||||
|
return fmt.Errorf("write temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := f.Sync(); err != nil {
|
||||||
|
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := f.Close(); err != nil {
|
||||||
|
return fmt.Errorf("close temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmpPath, path); err != nil {
|
||||||
|
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if dir, err := os.Open(filepath.Dir(path)); err == nil {
|
||||||
|
_ = dir.Sync()
|
||||||
|
_ = dir.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
success = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||||
|
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("seed file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("atomicWriteFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read final: %v", err)
|
||||||
|
}
|
||||||
|
if string(raw) != "new\n" {
|
||||||
|
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||||
|
a := &App{
|
||||||
|
runtime: fakeRuntime{
|
||||||
|
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
return schema.RuntimeHealth{
|
||||||
|
Status: "OK",
|
||||||
|
ExportDir: exportDir,
|
||||||
|
DriverReady: true,
|
||||||
|
CUDAReady: true,
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := a.RunRuntimePreflight("file:" + path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||||
|
}
|
||||||
|
if got != path {
|
||||||
|
t.Fatalf("path=%q want %q", got, path)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read runtime file: %v", err)
|
||||||
|
}
|
||||||
|
var health schema.RuntimeHealth
|
||||||
|
if err := json.Unmarshal(raw, &health); err != nil {
|
||||||
|
t.Fatalf("json unmarshal: %v", err)
|
||||||
|
}
|
||||||
|
if health.Status != "OK" {
|
||||||
|
t.Fatalf("status=%q want OK", health.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
779
audit/internal/app/blackbox.go
Normal file
779
audit/internal/app/blackbox.go
Normal file
@@ -0,0 +1,779 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"crypto/rand"
|
||||||
|
"encoding/hex"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io/fs"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
blackboxMarkerName = ".bee-blackbox"
|
||||||
|
blackboxDiscoverInterval = 2 * time.Second
|
||||||
|
blackboxMinFlushPeriod = 1 * time.Second
|
||||||
|
blackboxMaxFlushPeriod = 30 * time.Second
|
||||||
|
blackboxRecoveryFastCount = 5
|
||||||
|
)
|
||||||
|
|
||||||
|
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
|
||||||
|
|
||||||
|
var (
|
||||||
|
blackboxExecCommand = exec.Command
|
||||||
|
blackboxNow = func() time.Time { return time.Now().UTC() }
|
||||||
|
)
|
||||||
|
|
||||||
|
type BlackboxMarker struct {
|
||||||
|
Version int `json:"version"`
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
CreatedAtUTC string `json:"created_at_utc"`
|
||||||
|
Host string `json:"host,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BlackboxTargetStatus struct {
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
Device string `json:"device"`
|
||||||
|
FS platform.RemovableTarget `json:"fs"`
|
||||||
|
BootFolder string `json:"boot_folder"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
|
||||||
|
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
|
||||||
|
FlushPeriod string `json:"flush_period"`
|
||||||
|
LastError string `json:"last_error,omitempty"`
|
||||||
|
Mountpoint string `json:"mountpoint,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BlackboxState struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
BootStartedAtUTC string `json:"boot_started_at_utc"`
|
||||||
|
BootFolder string `json:"boot_folder"`
|
||||||
|
UpdatedAtUTC string `json:"updated_at_utc"`
|
||||||
|
Targets []BlackboxTargetStatus `json:"targets"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type blackboxRuntime struct {
|
||||||
|
exportDir string
|
||||||
|
statePath string
|
||||||
|
system *platform.System
|
||||||
|
bootStarted time.Time
|
||||||
|
bootFolder string
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
workers map[string]*blackboxWorker
|
||||||
|
}
|
||||||
|
|
||||||
|
type discoveredBlackboxTarget struct {
|
||||||
|
marker BlackboxMarker
|
||||||
|
target platform.RemovableTarget
|
||||||
|
seenMount string
|
||||||
|
mountedByBee bool
|
||||||
|
}
|
||||||
|
|
||||||
|
type blackboxWorker struct {
|
||||||
|
runtime *blackboxRuntime
|
||||||
|
enrollmentID string
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
target platform.RemovableTarget
|
||||||
|
marker BlackboxMarker
|
||||||
|
mountpoint string
|
||||||
|
mountedByBee bool
|
||||||
|
status string
|
||||||
|
lastSyncAt time.Time
|
||||||
|
lastDuration time.Duration
|
||||||
|
flushPeriod time.Duration
|
||||||
|
lastError string
|
||||||
|
fastCycles int
|
||||||
|
stopCh chan struct{}
|
||||||
|
stoppedCh chan struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
|
||||||
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
|
if exportDir == "" {
|
||||||
|
exportDir = DefaultExportDir
|
||||||
|
}
|
||||||
|
statePath = strings.TrimSpace(statePath)
|
||||||
|
if statePath == "" {
|
||||||
|
statePath = DefaultBlackboxStatePath
|
||||||
|
}
|
||||||
|
if system == nil {
|
||||||
|
system = platform.New()
|
||||||
|
}
|
||||||
|
bootStarted, err := bootStartedAtUTC()
|
||||||
|
if err != nil {
|
||||||
|
bootStarted = blackboxNow()
|
||||||
|
}
|
||||||
|
rt := &blackboxRuntime{
|
||||||
|
exportDir: exportDir,
|
||||||
|
statePath: statePath,
|
||||||
|
system: system,
|
||||||
|
bootStarted: bootStarted,
|
||||||
|
bootFolder: SupportBundleBaseName(bootStarted),
|
||||||
|
workers: make(map[string]*blackboxWorker),
|
||||||
|
}
|
||||||
|
_ = os.MkdirAll(filepath.Dir(statePath), 0755)
|
||||||
|
rt.persistState()
|
||||||
|
ticker := time.NewTicker(blackboxDiscoverInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
rt.reconcile()
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
rt.stopAll()
|
||||||
|
return ctx.Err()
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ReadBlackboxState(path string) (BlackboxState, error) {
|
||||||
|
path = strings.TrimSpace(path)
|
||||||
|
if path == "" {
|
||||||
|
path = DefaultBlackboxStatePath
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return BlackboxState{}, err
|
||||||
|
}
|
||||||
|
var state BlackboxState
|
||||||
|
if err := json.Unmarshal(raw, &state); err != nil {
|
||||||
|
return BlackboxState{}, err
|
||||||
|
}
|
||||||
|
return state, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
if target.Device == "" {
|
||||||
|
return BlackboxMarker{}, fmt.Errorf("device is required")
|
||||||
|
}
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
|
||||||
|
if err != nil {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
marker, _, err := readBlackboxMarker(mountpoint)
|
||||||
|
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
if marker.EnrollmentID == "" {
|
||||||
|
marker = BlackboxMarker{
|
||||||
|
Version: 1,
|
||||||
|
EnrollmentID: newBlackboxEnrollmentID(),
|
||||||
|
CreatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||||
|
Host: hostnameOr("unknown"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := writeBlackboxMarker(mountpoint, marker); err != nil {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
return marker, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func DisableBlackboxTarget(device, enrollmentID string) error {
|
||||||
|
device = strings.TrimSpace(device)
|
||||||
|
enrollmentID = strings.TrimSpace(enrollmentID)
|
||||||
|
if device == "" && enrollmentID == "" {
|
||||||
|
return fmt.Errorf("device or enrollment_id is required")
|
||||||
|
}
|
||||||
|
system := platform.New()
|
||||||
|
targets, err := system.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, target := range targets {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
|
||||||
|
if mountErr != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
remove := false
|
||||||
|
marker, _, err := readBlackboxMarker(mountpoint)
|
||||||
|
if err == nil {
|
||||||
|
if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
|
||||||
|
remove = true
|
||||||
|
}
|
||||||
|
if device != "" && target.Device == device {
|
||||||
|
remove = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if remove {
|
||||||
|
err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
|
||||||
|
}
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
if remove {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return os.ErrNotExist
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) reconcile() {
|
||||||
|
discovered, _ := rt.discoverMarkedTargets()
|
||||||
|
|
||||||
|
rt.mu.Lock()
|
||||||
|
defer rt.mu.Unlock()
|
||||||
|
|
||||||
|
seen := make(map[string]struct{}, len(discovered))
|
||||||
|
for _, found := range discovered {
|
||||||
|
seen[found.marker.EnrollmentID] = struct{}{}
|
||||||
|
worker, ok := rt.workers[found.marker.EnrollmentID]
|
||||||
|
if !ok {
|
||||||
|
worker = newBlackboxWorker(rt, found)
|
||||||
|
rt.workers[found.marker.EnrollmentID] = worker
|
||||||
|
go worker.run()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
worker.update(found)
|
||||||
|
}
|
||||||
|
for id, worker := range rt.workers {
|
||||||
|
if _, ok := seen[id]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
worker.stop()
|
||||||
|
delete(rt.workers, id)
|
||||||
|
}
|
||||||
|
rt.persistStateLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) stopAll() {
|
||||||
|
rt.mu.Lock()
|
||||||
|
workers := make([]*blackboxWorker, 0, len(rt.workers))
|
||||||
|
for _, worker := range rt.workers {
|
||||||
|
workers = append(workers, worker)
|
||||||
|
}
|
||||||
|
rt.workers = map[string]*blackboxWorker{}
|
||||||
|
rt.persistStateLocked()
|
||||||
|
rt.mu.Unlock()
|
||||||
|
for _, worker := range workers {
|
||||||
|
worker.stop()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
|
||||||
|
targets, err := rt.system.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var out []discoveredBlackboxTarget
|
||||||
|
for _, rawTarget := range targets {
|
||||||
|
target := sanitizeRemovableTarget(rawTarget)
|
||||||
|
if target.Device == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
marker, ok, err := readBlackboxMarker(mountpoint)
|
||||||
|
if mountedByBee && !ok {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
if err != nil || !ok || marker.EnrollmentID == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
out = append(out, discoveredBlackboxTarget{
|
||||||
|
marker: marker,
|
||||||
|
target: target,
|
||||||
|
seenMount: mountpoint,
|
||||||
|
mountedByBee: mountedByBee,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(out, func(i, j int) bool {
|
||||||
|
return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
|
||||||
|
})
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
|
||||||
|
return &blackboxWorker{
|
||||||
|
runtime: rt,
|
||||||
|
enrollmentID: found.marker.EnrollmentID,
|
||||||
|
target: found.target,
|
||||||
|
marker: found.marker,
|
||||||
|
flushPeriod: blackboxMinFlushPeriod,
|
||||||
|
status: "running",
|
||||||
|
stopCh: make(chan struct{}),
|
||||||
|
stoppedCh: make(chan struct{}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) run() {
|
||||||
|
defer close(w.stoppedCh)
|
||||||
|
for {
|
||||||
|
start := time.Now()
|
||||||
|
err := w.syncCycle()
|
||||||
|
duration := time.Since(start)
|
||||||
|
w.finishCycle(duration, err)
|
||||||
|
|
||||||
|
wait := w.currentFlushPeriod()
|
||||||
|
timer := time.NewTimer(wait)
|
||||||
|
select {
|
||||||
|
case <-w.stopCh:
|
||||||
|
timer.Stop()
|
||||||
|
w.cleanup()
|
||||||
|
return
|
||||||
|
case <-timer.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.target = found.target
|
||||||
|
w.marker = found.marker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) stop() {
|
||||||
|
select {
|
||||||
|
case <-w.stopCh:
|
||||||
|
default:
|
||||||
|
close(w.stopCh)
|
||||||
|
}
|
||||||
|
<-w.stoppedCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.flushPeriod
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.lastDuration = duration
|
||||||
|
if err != nil {
|
||||||
|
w.status = "degraded"
|
||||||
|
w.lastError = err.Error()
|
||||||
|
w.fastCycles = 0
|
||||||
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
|
||||||
|
} else {
|
||||||
|
w.status = "running"
|
||||||
|
w.lastSyncAt = blackboxNow()
|
||||||
|
w.lastError = ""
|
||||||
|
if duration <= w.flushPeriod/2 {
|
||||||
|
w.fastCycles++
|
||||||
|
} else {
|
||||||
|
w.fastCycles = 0
|
||||||
|
}
|
||||||
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||||
|
}
|
||||||
|
w.runtime.persistState()
|
||||||
|
}
|
||||||
|
|
||||||
|
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
|
||||||
|
if current <= 0 {
|
||||||
|
current = blackboxMinFlushPeriod
|
||||||
|
}
|
||||||
|
if duration <= 0 {
|
||||||
|
duration = current
|
||||||
|
}
|
||||||
|
next := current
|
||||||
|
if duration > current {
|
||||||
|
growA := time.Duration(float64(current) * 1.25)
|
||||||
|
growB := time.Duration(float64(duration) * 1.25)
|
||||||
|
if growB > growA {
|
||||||
|
next = growB
|
||||||
|
} else {
|
||||||
|
next = growA
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if success && fastCycles >= blackboxRecoveryFastCount {
|
||||||
|
next = time.Duration(float64(current) * 0.9)
|
||||||
|
}
|
||||||
|
if next < blackboxMinFlushPeriod {
|
||||||
|
next = blackboxMinFlushPeriod
|
||||||
|
}
|
||||||
|
if next > blackboxMaxFlushPeriod {
|
||||||
|
next = blackboxMaxFlushPeriod
|
||||||
|
}
|
||||||
|
return next
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) syncCycle() error {
|
||||||
|
target, marker := w.snapshotTarget()
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
w.recordMountpoint(mountpoint, mountedByBee)
|
||||||
|
|
||||||
|
root := filepath.Join(mountpoint, w.runtime.bootFolder)
|
||||||
|
if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := w.captureSnapshots(root); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return syncFilesystem(root)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) cleanup() {
|
||||||
|
w.mu.Lock()
|
||||||
|
mountpoint := w.mountpoint
|
||||||
|
mountedByBee := w.mountedByBee
|
||||||
|
w.mu.Unlock()
|
||||||
|
if mountedByBee && mountpoint != "" {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.target, w.marker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.mountpoint = mountpoint
|
||||||
|
w.mountedByBee = mountedByBee
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) captureSnapshots(root string) error {
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, svc := range supportBundleServices {
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, item := range supportBundleOptionalFiles {
|
||||||
|
if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) persistState() {
|
||||||
|
rt.mu.Lock()
|
||||||
|
defer rt.mu.Unlock()
|
||||||
|
rt.persistStateLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) persistStateLocked() {
|
||||||
|
state := BlackboxState{
|
||||||
|
Status: "disabled",
|
||||||
|
BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
|
||||||
|
BootFolder: rt.bootFolder,
|
||||||
|
UpdatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||||
|
Targets: make([]BlackboxTargetStatus, 0, len(rt.workers)),
|
||||||
|
}
|
||||||
|
if len(rt.workers) > 0 {
|
||||||
|
state.Status = "running"
|
||||||
|
}
|
||||||
|
for _, worker := range rt.workers {
|
||||||
|
worker.mu.Lock()
|
||||||
|
targetState := BlackboxTargetStatus{
|
||||||
|
EnrollmentID: worker.enrollmentID,
|
||||||
|
Device: worker.target.Device,
|
||||||
|
FS: worker.target,
|
||||||
|
BootFolder: rt.bootFolder,
|
||||||
|
Status: worker.status,
|
||||||
|
FlushPeriod: worker.flushPeriod.String(),
|
||||||
|
LastError: worker.lastError,
|
||||||
|
Mountpoint: worker.mountpoint,
|
||||||
|
}
|
||||||
|
if !worker.lastSyncAt.IsZero() {
|
||||||
|
targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
|
||||||
|
}
|
||||||
|
if worker.lastDuration > 0 {
|
||||||
|
targetState.LastCycleDuration = worker.lastDuration.String()
|
||||||
|
}
|
||||||
|
if worker.status == "degraded" {
|
||||||
|
state.Status = "degraded"
|
||||||
|
}
|
||||||
|
worker.mu.Unlock()
|
||||||
|
state.Targets = append(state.Targets, targetState)
|
||||||
|
}
|
||||||
|
sort.Slice(state.Targets, func(i, j int) bool {
|
||||||
|
return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
|
||||||
|
})
|
||||||
|
_ = writeJSONAtomic(rt.statePath, state)
|
||||||
|
}
|
||||||
|
|
||||||
|
func bootStartedAtUTC() (time.Time, error) {
|
||||||
|
raw, err := os.ReadFile("/proc/stat")
|
||||||
|
if err != nil {
|
||||||
|
return time.Time{}, err
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(string(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(line, "btime ") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Fields(line)
|
||||||
|
if len(parts) != 2 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
sec, err := time.ParseDuration(parts[1] + "s")
|
||||||
|
if err != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
return time.Unix(int64(sec/time.Second), 0).UTC(), nil
|
||||||
|
}
|
||||||
|
return time.Time{}, fmt.Errorf("boot time not found")
|
||||||
|
}
|
||||||
|
|
||||||
|
func newBlackboxEnrollmentID() string {
|
||||||
|
var buf [8]byte
|
||||||
|
if _, err := rand.Read(buf[:]); err != nil {
|
||||||
|
return fmt.Sprintf("bb-%d", time.Now().UnixNano())
|
||||||
|
}
|
||||||
|
return "bb-" + hex.EncodeToString(buf[:])
|
||||||
|
}
|
||||||
|
|
||||||
|
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
|
||||||
|
target.Device = strings.TrimSpace(target.Device)
|
||||||
|
target.FSType = strings.TrimSpace(target.FSType)
|
||||||
|
target.Size = strings.TrimSpace(target.Size)
|
||||||
|
target.Label = strings.TrimSpace(target.Label)
|
||||||
|
target.Model = strings.TrimSpace(target.Model)
|
||||||
|
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
|
||||||
|
return target
|
||||||
|
}
|
||||||
|
|
||||||
|
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
if target.Mountpoint != "" {
|
||||||
|
if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
|
||||||
|
return target.Mountpoint, false, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
|
||||||
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||||
|
return "", false, formatBlackboxMountTargetError(target, string(raw), err)
|
||||||
|
}
|
||||||
|
if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
return mountpoint, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func unmountTarget(mountpoint string) error {
|
||||||
|
_ = blackboxExecCommand("sync").Run()
|
||||||
|
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
msg := strings.TrimSpace(string(raw))
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
|
||||||
|
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
return BlackboxMarker{}, false, os.ErrNotExist
|
||||||
|
}
|
||||||
|
return BlackboxMarker{}, false, err
|
||||||
|
}
|
||||||
|
var marker BlackboxMarker
|
||||||
|
if err := json.Unmarshal(raw, &marker); err != nil {
|
||||||
|
return BlackboxMarker{}, false, err
|
||||||
|
}
|
||||||
|
return marker, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
|
||||||
|
if marker.Version == 0 {
|
||||||
|
marker.Version = 1
|
||||||
|
}
|
||||||
|
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
|
||||||
|
}
|
||||||
|
|
||||||
|
func syncDirectoryTree(srcDir, dstDir string) error {
|
||||||
|
seen := make(map[string]struct{})
|
||||||
|
err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(srcDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel = filepath.Clean(rel)
|
||||||
|
if rel == "." {
|
||||||
|
seen["."] = struct{}{}
|
||||||
|
return os.MkdirAll(dstDir, 0755)
|
||||||
|
}
|
||||||
|
seen[rel] = struct{}{}
|
||||||
|
dstPath := filepath.Join(dstDir, rel)
|
||||||
|
if d.IsDir() {
|
||||||
|
info, err := d.Info()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.MkdirAll(dstPath, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
return copyFileIfChanged(path, dstPath)
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return removeMissingPaths(dstDir, seen)
|
||||||
|
}
|
||||||
|
|
||||||
|
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
|
||||||
|
return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(dstDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel = filepath.Clean(rel)
|
||||||
|
if rel == "." {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if _, ok := seen[rel]; ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return os.RemoveAll(path)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func copyFileIfChanged(src, dst string) error {
|
||||||
|
info, err := os.Stat(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return os.MkdirAll(dst, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
srcData, err := os.ReadFile(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return writeFileAtomic(dst, srcData, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
|
||||||
|
func captureCommandAtomic(dst string, name string, args ...string) error {
|
||||||
|
raw, err := blackboxExecCommand(name, args...).CombinedOutput()
|
||||||
|
if len(raw) == 0 {
|
||||||
|
if err != nil {
|
||||||
|
raw = []byte(err.Error() + "\n")
|
||||||
|
} else {
|
||||||
|
raw = []byte("no output\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return writeFileAtomic(dst, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeJSONAtomic(path string, v any) error {
|
||||||
|
raw, err := json.MarshalIndent(v, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
raw = append(raw, '\n')
|
||||||
|
return writeFileAtomic(path, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if _, err := f.Write(data); err != nil {
|
||||||
|
_ = f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := f.Sync(); err != nil {
|
||||||
|
_ = f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := f.Close(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmp, path); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return syncFilesystem(filepath.Dir(path))
|
||||||
|
}
|
||||||
|
|
||||||
|
// syncFilesystem flushes pending filesystem writes to stable storage by
// shelling out to the external "sync" command.
//
// NOTE(review): the path argument is currently unused — "sync" with no
// arguments flushes every mounted filesystem, not just the one containing
// path. Confirm whether a targeted sync was intended.
func syncFilesystem(path string) error {
	return blackboxExecCommand("sync").Run()
}
|
||||||
|
|
||||||
|
func ensureWritableBlackboxMountpoint(mountpoint string) error {
|
||||||
|
probe, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("target filesystem is not writable: %w", err)
|
||||||
|
}
|
||||||
|
name := probe.Name()
|
||||||
|
if closeErr := probe.Close(); closeErr != nil {
|
||||||
|
_ = os.Remove(name)
|
||||||
|
return closeErr
|
||||||
|
}
|
||||||
|
if err := os.Remove(name); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
|
||||||
|
msg := strings.TrimSpace(raw)
|
||||||
|
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||||
|
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||||
|
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
52
audit/internal/app/blackbox_test.go
Normal file
52
audit/internal/app/blackbox_test.go
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestAdjustFlushPeriodGrowsOnSlowCycle verifies that a flush cycle that
// took longer than the current period (4s elapsed vs a 2s period) makes
// adjustFlushPeriod back off to a strictly larger period.
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
	current := 2 * time.Second
	got := adjustFlushPeriod(current, 4*time.Second, false, 0)
	if got <= current {
		t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
	}
}
|
||||||
|
|
||||||
|
// TestAdjustFlushPeriodShrinksAfterFastCycles verifies that after enough
// consecutive fast cycles (blackboxRecoveryFastCount) adjustFlushPeriod
// shrinks the period, while never dropping below blackboxMinFlushPeriod.
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
	current := 10 * time.Second
	got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
	if got >= current {
		t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
	}
	if got < blackboxMinFlushPeriod {
		t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
	}
}
|
||||||
|
|
||||||
|
// TestReadBlackboxState round-trips a BlackboxState through writeJSONAtomic
// and ReadBlackboxState, spot-checking that the top-level fields and the
// single target entry survive the trip.
func TestReadBlackboxState(t *testing.T) {
	path := filepath.Join(t.TempDir(), "blackbox-state.json")
	want := BlackboxState{
		Status:           "running",
		BootStartedAtUTC: "2026-04-24T00:00:00Z",
		BootFolder:       "boot-folder",
		UpdatedAtUTC:     "2026-04-24T00:00:01Z",
		Targets: []BlackboxTargetStatus{{
			EnrollmentID: "bb-1",
			Device:       "/dev/sdb1",
			Status:       "running",
			FlushPeriod:  "1s",
		}},
	}
	if err := writeJSONAtomic(path, want); err != nil {
		t.Fatalf("writeJSONAtomic: %v", err)
	}
	got, err := ReadBlackboxState(path)
	if err != nil {
		t.Fatalf("ReadBlackboxState: %v", err)
	}
	// Spot-check rather than deep-compare: full equality is covered by the
	// JSON codec; this guards the fields callers actually read.
	if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
		t.Fatalf("state=%+v", got)
	}
}
|
||||||
@@ -21,12 +21,12 @@ type ComponentStatusDB struct {
|
|||||||
|
|
||||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||||
type ComponentStatusRecord struct {
|
type ComponentStatusRecord struct {
|
||||||
ComponentKey string `json:"component_key"`
|
ComponentKey string `json:"component_key"`
|
||||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||||
LastChangedAt time.Time `json:"last_changed_at"`
|
LastChangedAt time.Time `json:"last_changed_at"`
|
||||||
ErrorSummary string `json:"error_summary,omitempty"`
|
ErrorSummary string `json:"error_summary,omitempty"`
|
||||||
History []ComponentStatusEntry `json:"history"`
|
History []ComponentStatusEntry `json:"history"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// ComponentStatusEntry is one observation written to a component's history.
|
// ComponentStatusEntry is one observation written to a component's history.
|
||||||
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
|||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
data, err := os.ReadFile(path)
|
data, err := readFileLimited(path, 10<<20)
|
||||||
if err != nil && !os.IsNotExist(err) {
|
if err != nil && !os.IsNotExist(err) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -179,7 +179,9 @@ func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
|||||||
|
|
||||||
// Map SAT target to component keys.
|
// Map SAT target to component keys.
|
||||||
switch target {
|
switch target {
|
||||||
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
|
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
|
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
|
||||||
|
"amd-stress", "amd-mem", "amd-bandwidth":
|
||||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||||
case "memory", "memory-stress", "sat-stress":
|
case "memory", "memory-stress", "sat-stress":
|
||||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package app
|
|||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
|||||||
}
|
}
|
||||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||||
|
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||||
}
|
}
|
||||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||||
applyMemorySAT(snap.Memory, summary)
|
applyMemorySAT(snap.Memory, summary)
|
||||||
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
|||||||
applyComponentStatusDB(snap, db)
|
applyComponentStatusDB(snap, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nvidiaPerGPUStatus is the parsed per-GPU outcome of an NVIDIA SAT run,
// read from one gpu-N-status.txt file in the run directory.
type nvidiaPerGPUStatus struct {
	// runStatus is the upper-cased, trimmed run_status value from the file.
	runStatus string
	// reason is the optional trimmed human-readable failure explanation.
	reason string
}
|
||||||
|
|
||||||
|
// applyNvidiaPerGPUStatus overlays per-GPU SAT run results onto the PCIe
// device list. Devices are matched to results via the "nvidia_gpu_index"
// telemetry key; devices without telemetry, without that key, or without a
// matching per-GPU status entry are left untouched.
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
	statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
	if !ok {
		return
	}
	for i := range devs {
		if devs[i].Telemetry == nil {
			continue
		}
		rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
		if !ok {
			continue
		}
		// Telemetry values are untyped; accept any int-like representation.
		idx, ok := telemetryInt(rawIdx)
		if !ok {
			continue
		}
		st, ok := statusByIndex[idx]
		if !ok {
			continue
		}
		// Fall back to a generic label when the status file carried no
		// reason text.
		status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
		if !ok {
			continue
		}
		mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
	}
}
|
||||||
|
|
||||||
|
// loadLatestNvidiaPerGPUStatus reads the most recent "gpu-nvidia-*" SAT
// run directory under baseDir and parses every gpu-*-status.txt file in
// it. It returns the per-GPU statuses keyed by gpu_index, the run's
// run_at_utc timestamp from summary.txt, and whether anything was loaded.
// "Latest" is chosen by lexicographic sort of the directory names —
// assumes the names embed a sortable timestamp (TODO confirm naming).
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
	matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
	if err != nil || len(matches) == 0 {
		return nil, "", false
	}
	sort.Strings(matches)
	runDir := matches[len(matches)-1]
	// A run without summary.txt is treated as no data at all.
	summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
	if err != nil {
		return nil, "", false
	}
	summaryKV := parseKeyValueSummary(string(summaryRaw))
	runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
	files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
	if err != nil || len(files) == 0 {
		return nil, "", false
	}
	out := make(map[int]nvidiaPerGPUStatus, len(files))
	for _, file := range files {
		raw, err := os.ReadFile(file)
		if err != nil {
			// Skip unreadable per-GPU files; the others may still parse.
			continue
		}
		kv := parseKeyValueSummary(string(raw))
		idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
		if err != nil {
			continue
		}
		out[idx] = nvidiaPerGPUStatus{
			runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
			reason:    strings.TrimSpace(kv["reason"]),
		}
	}
	if len(out) == 0 {
		return nil, "", false
	}
	return out, runAtUTC, true
}
|
||||||
|
|
||||||
|
// telemetryInt coerces a dynamically-typed telemetry value into an int.
// Numeric types convert directly (float64 truncates toward zero); strings
// are trimmed and parsed with strconv.Atoi. The bool result reports
// whether the conversion succeeded.
func telemetryInt(v any) (int, bool) {
	switch raw := v.(type) {
	case int:
		return raw, true
	case int32:
		return int(raw), true
	case int64:
		return int(raw), true
	case float64:
		return int(raw), true
	case string:
		if n, parseErr := strconv.Atoi(strings.TrimSpace(raw)); parseErr == nil {
			return n, true
		}
	}
	return 0, false
}
|
||||||
|
|
||||||
type satSummary struct {
|
type satSummary struct {
|
||||||
runAtUTC string
|
runAtUTC string
|
||||||
overall string
|
overall string
|
||||||
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mergeComponentStatusPreferDetail merges a SAT-derived status into
// component, preferring entries that carry concrete detail text. Unlike a
// plain severity merge, an update of EQUAL severity with a non-empty
// description still overwrites the current status/description (and appends
// a history entry when a timestamp is available), so a specific per-GPU
// reason can replace a generic run-level message.
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
	if component == nil || satStatus == "" {
		return
	}
	current := strings.TrimSpace(ptrString(component.Status))
	newSeverity := statusSeverity(satStatus)
	currentSeverity := statusSeverity(current)
	// First status, unknown status, or strictly worse severity: defer to
	// the standard merge.
	if current == "" || current == "Unknown" || newSeverity > currentSeverity {
		mergeComponentStatus(component, changedAt, satStatus, description)
		return
	}
	// Same severity and we have detail text: overwrite in place.
	if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
		component.Status = appStringPtr(satStatus)
		component.ErrorDescription = appStringPtr(description)
		// Only stamp the change (and extend history) when a timestamp was
		// supplied; otherwise the previous change time stands.
		if strings.TrimSpace(changedAt) != "" {
			component.StatusChangedAt = appStringPtr(changedAt)
			component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
				Status:    satStatus,
				ChangedAt: changedAt,
				Details:   appStringPtr(description),
			})
		}
	}
}
|
||||||
|
|
||||||
func statusSeverity(status string) int {
|
func statusSeverity(status string) int {
|
||||||
switch strings.TrimSpace(status) {
|
switch strings.TrimSpace(status) {
|
||||||
case "Critical":
|
case "Critical":
|
||||||
|
|||||||
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile checks that a
// per-GPU status file inside the latest gpu-nvidia-* run directory marks
// only the matching GPU (matched via nvidia_gpu_index telemetry) as
// Critical and carries the per-GPU reason into its error description.
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
	baseDir := t.TempDir()
	runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
	if err := os.MkdirAll(runDir, 0755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
		t.Fatal(err)
	}
	// Only GPU index 1 gets a per-GPU status file; GPU 0 must stay untouched.
	if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
		t.Fatal(err)
	}

	class := "VideoController"
	manufacturer := "NVIDIA Corporation"
	bdf0 := "0000:4b:00.0"
	bdf1 := "0000:4f:00.0"
	snap := schema.HardwareSnapshot{
		PCIeDevices: []schema.HardwarePCIeDevice{
			{
				DeviceClass:  &class,
				Manufacturer: &manufacturer,
				BDF:          &bdf0,
				Telemetry:    map[string]any{"nvidia_gpu_index": 0},
			},
			{
				DeviceClass:  &class,
				Manufacturer: &manufacturer,
				BDF:          &bdf1,
				Telemetry:    map[string]any{"nvidia_gpu_index": 1},
			},
		},
	}

	applyLatestSATStatuses(&snap, baseDir, nil)

	if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
		t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
	}
	// NOTE(review): the expected text appends " failed" to the file's
	// reason — presumably added by satKeyStatus; confirm against that
	// helper if this expectation looks surprising.
	if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
		got := "<nil>"
		if snap.PCIeDevices[1].ErrorDescription != nil {
			got = *snap.PCIeDevices[1].ErrorDescription
		}
		t.Fatalf("gpu1 error=%q want per-gpu reason", got)
	}
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package app
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -14,12 +15,17 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var supportBundleServices = []string{
|
var supportBundleServices = []string{
|
||||||
|
"bee-blackbox.service",
|
||||||
"bee-audit.service",
|
"bee-audit.service",
|
||||||
"bee-web.service",
|
"bee-web.service",
|
||||||
"bee-network.service",
|
"bee-network.service",
|
||||||
"bee-nvidia.service",
|
"bee-nvidia.service",
|
||||||
"bee-preflight.service",
|
"bee-preflight.service",
|
||||||
|
"bee-selfheal.service",
|
||||||
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
|
"nvidia-dcgm.service",
|
||||||
|
"nvidia-fabricmanager.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
var supportBundleCommands = []struct {
|
var supportBundleCommands = []struct {
|
||||||
@@ -38,17 +44,112 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||||
|
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v dmesg >/dev/null 2>&1; then
|
||||||
|
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||||
|
else
|
||||||
|
echo "dmesg not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
nvidia-smi topo -m 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "nvidia-smi not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
|
echo "systemctl not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "=== unit files ==="
|
||||||
|
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== active units ==="
|
||||||
|
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== failed units ==="
|
||||||
|
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||||
|
`}},
|
||||||
|
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||||
|
for candidate in \
|
||||||
|
/usr/bin/nvidia-fabricmanager \
|
||||||
|
/usr/bin/nv-fabricmanager \
|
||||||
|
/usr/bin/nvidia-fabricmanagerd \
|
||||||
|
/usr/bin/nvlsm; do
|
||||||
|
if [ -e "$candidate" ]; then
|
||||||
|
echo "=== $candidate ==="
|
||||||
|
ls -l "$candidate" 2>&1 || true
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||||
|
echo "no fabric manager binaries found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
|
echo "lspci not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||||
|
found=1
|
||||||
|
echo "=== GPU $gpu ==="
|
||||||
|
lspci -s "$gpu" -vv 2>&1 || true
|
||||||
|
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||||
|
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||||
|
echo
|
||||||
|
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||||
|
lspci -s "$bridge" -vv 2>&1 || true
|
||||||
|
fi
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no NVIDIA PCI devices found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||||
for d in /sys/bus/pci/devices/*/; do
|
for d in /sys/bus/pci/devices/*/; do
|
||||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||||
[ "$vendor" = "0x10de" ] || continue
|
[ "$vendor" = "0x10de" ] || continue
|
||||||
dev=$(basename "$d")
|
class=$(cat "$d/class" 2>/dev/null)
|
||||||
|
case "$class" in
|
||||||
|
0x030000|0x030200) ;;
|
||||||
|
*) continue ;;
|
||||||
|
esac
|
||||||
|
dev=$(basename "$d")
|
||||||
echo "=== $dev ==="
|
echo "=== $dev ==="
|
||||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
`}},
|
||||||
|
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||||
|
found=0
|
||||||
|
for dev in /sys/bus/pci/devices/*; do
|
||||||
|
[ -e "$dev" ] || continue
|
||||||
|
bdf=$(basename "$dev")
|
||||||
|
block=""
|
||||||
|
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||||
|
if [ -r "$dev/$f" ]; then
|
||||||
|
if [ -z "$block" ]; then
|
||||||
|
block=1
|
||||||
|
found=1
|
||||||
|
echo "=== $bdf ==="
|
||||||
|
fi
|
||||||
|
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ -n "$block" ]; then
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no PCIe AER sysfs counters found"
|
||||||
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v ethtool >/dev/null 2>&1; then
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
@@ -135,9 +236,13 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
}{
|
}{
|
||||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
|
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||||
|
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||||
|
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||||
|
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
|
|
||||||
func BuildSupportBundle(exportDir string) (string, error) {
|
func BuildSupportBundle(exportDir string) (string, error) {
|
||||||
exportDir = strings.TrimSpace(exportDir)
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
@@ -151,9 +256,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
host := sanitizeFilename(hostnameOr("unknown"))
|
now := time.Now().UTC()
|
||||||
ts := time.Now().UTC().Format("20060102-150405")
|
|
||||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -185,13 +290,24 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
archiveName := SupportBundleBaseName(now) + ".tar.gz"
|
||||||
|
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return archivePath, nil
|
return archivePath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SupportBundleBaseName(at time.Time) string {
|
||||||
|
at = at.UTC()
|
||||||
|
date := at.Format("2006-01-02")
|
||||||
|
tod := at.Format("150405")
|
||||||
|
ver := bundleVersion()
|
||||||
|
model := serverModelForBundle()
|
||||||
|
sn := serverSerialForBundle()
|
||||||
|
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
|
||||||
|
}
|
||||||
|
|
||||||
func LatestSupportBundlePath() (string, error) {
|
func LatestSupportBundlePath() (string, error) {
|
||||||
return latestSupportBundlePath(os.TempDir())
|
return latestSupportBundlePath(os.TempDir())
|
||||||
}
|
}
|
||||||
@@ -315,6 +431,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||||
|
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||||
|
if strings.TrimSpace(cfg.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
fmt.Fprintf(&body, "\nfiles:\n")
|
fmt.Fprintf(&body, "\nfiles:\n")
|
||||||
|
|
||||||
var files []string
|
var files []string
|
||||||
@@ -342,6 +465,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func bundleVersion() string {
|
||||||
|
v := buildVersion()
|
||||||
|
v = strings.TrimPrefix(v, "v")
|
||||||
|
v = strings.TrimPrefix(v, "V")
|
||||||
|
if v == "" || v == "unknown" {
|
||||||
|
return "0.0"
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
|
func serverModelForBundle() string {
|
||||||
|
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||||
|
if err != nil {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(string(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
key, val, ok := strings.Cut(line, ": ")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(key) == "Product Name" {
|
||||||
|
val = strings.TrimSpace(val)
|
||||||
|
if val == "" {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
return strings.ReplaceAll(val, " ", "_")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
func serverSerialForBundle() string {
|
||||||
|
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||||
|
if err != nil {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(string(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
key, val, ok := strings.Cut(line, ": ")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(key) == "Serial Number" {
|
||||||
|
val = strings.TrimSpace(val)
|
||||||
|
if val == "" {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
|
||||||
func buildVersion() string {
|
func buildVersion() string {
|
||||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
return exec.CommandContext(ctx, name, args...).Output()
|
return exec.CommandContext(ctx, name, args...).Output()
|
||||||
}
|
}
|
||||||
|
|
||||||
func interfaceHasCarrier(iface string) bool {
|
|
||||||
raw, err := readNetCarrierFile(iface)
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return strings.TrimSpace(raw) == "1"
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if interfaceHasCarrier(iface) {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
enriched++
|
||||||
enriched++
|
continue
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
}
|
}
|
||||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
val := strings.TrimSpace(trimmed[idx+1:])
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
|
case key == "identifier":
|
||||||
|
s := parseSFPIdentifier(val)
|
||||||
|
dev.SFPIdentifier = &s
|
||||||
|
t := true
|
||||||
|
dev.SFPPresent = &t
|
||||||
|
changed = true
|
||||||
|
case key == "connector":
|
||||||
|
s := parseSFPConnector(val)
|
||||||
|
dev.SFPConnector = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor name":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPVendor = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor pn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPPartNumber = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor sn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPSerialNumber = &s
|
||||||
|
changed = true
|
||||||
|
case strings.Contains(key, "laser wavelength"):
|
||||||
|
if f, ok := firstFloat(val); ok {
|
||||||
|
dev.SFPWavelengthNM = &f
|
||||||
|
changed = true
|
||||||
|
}
|
||||||
case strings.Contains(key, "module temperature"):
|
case strings.Contains(key, "module temperature"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
dev.SFPTemperatureC = &f
|
dev.SFPTemperatureC = &f
|
||||||
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
return changed
|
return changed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||||
|
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||||
|
func parseSFPIdentifier(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||||
|
// e.g. "0x07 (LC)" → "LC".
|
||||||
|
func parseSFPConnector(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
|
||||||
|
|
||||||
|
func extractParens(s string) string {
|
||||||
|
m := parenRe.FindStringSubmatch(s)
|
||||||
|
if len(m) < 2 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(m[1])
|
||||||
|
}
|
||||||
|
|
||||||
func parseSFPDOM(raw string) map[string]any {
|
func parseSFPDOM(raw string) map[string]any {
|
||||||
dev := schema.HardwarePCIeDevice{}
|
dev := schema.HardwarePCIeDevice{}
|
||||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||||
return map[string]any{}
|
return map[string]any{}
|
||||||
}
|
}
|
||||||
out := map[string]any{}
|
out := map[string]any{}
|
||||||
|
if dev.SFPPresent != nil {
|
||||||
|
out["sfp_present"] = *dev.SFPPresent
|
||||||
|
}
|
||||||
|
if dev.SFPIdentifier != nil {
|
||||||
|
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||||
|
}
|
||||||
|
if dev.SFPConnector != nil {
|
||||||
|
out["sfp_connector"] = *dev.SFPConnector
|
||||||
|
}
|
||||||
|
if dev.SFPVendor != nil {
|
||||||
|
out["sfp_vendor"] = *dev.SFPVendor
|
||||||
|
}
|
||||||
|
if dev.SFPPartNumber != nil {
|
||||||
|
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||||
|
}
|
||||||
|
if dev.SFPSerialNumber != nil {
|
||||||
|
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||||
|
}
|
||||||
|
if dev.SFPWavelengthNM != nil {
|
||||||
|
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||||
|
}
|
||||||
if dev.SFPTemperatureC != nil {
|
if dev.SFPTemperatureC != nil {
|
||||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
|
|||||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
ethtoolModuleQuery = func(string) (string, error) {
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||||
t.Fatal("ethtool -m should not be called without carrier")
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
class := "EthernetController"
|
class := "EthernetController"
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
|
|||||||
@@ -13,7 +13,9 @@ import (
|
|||||||
const nvidiaVendorID = 0x10de
|
const nvidiaVendorID = 0x10de
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
|
Index int
|
||||||
BDF string
|
BDF string
|
||||||
|
Name string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
TemperatureC *float64
|
TemperatureC *float64
|
||||||
@@ -72,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(info.Name); v != "" {
|
||||||
|
devs[i].Model = &v
|
||||||
|
}
|
||||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||||
devs[i].SerialNumber = &v
|
devs[i].SerialNumber = &v
|
||||||
}
|
}
|
||||||
@@ -98,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||||
out, err := exec.Command(
|
out, err := exec.Command(
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
).Output()
|
).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -122,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
if len(rec) == 0 {
|
if len(rec) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(rec) < 13 {
|
if len(rec) < 14 {
|
||||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||||
}
|
}
|
||||||
|
|
||||||
bdf := normalizePCIeBDF(rec[1])
|
bdf := normalizePCIeBDF(rec[1])
|
||||||
@@ -132,18 +137,20 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
|
Index: parseRequiredInt(rec[0]),
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Name: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
Serial: strings.TrimSpace(rec[3]),
|
||||||
TemperatureC: parseMaybeFloat(rec[4]),
|
VBIOS: strings.TrimSpace(rec[4]),
|
||||||
PowerW: parseMaybeFloat(rec[5]),
|
TemperatureC: parseMaybeFloat(rec[5]),
|
||||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
PowerW: parseMaybeFloat(rec[6]),
|
||||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||||
HWSlowdown: parseMaybeBool(rec[8]),
|
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
HWSlowdown: parseMaybeBool(rec[9]),
|
||||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||||
|
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||||
}
|
}
|
||||||
result[bdf] = info
|
result[bdf] = info
|
||||||
}
|
}
|
||||||
@@ -187,6 +194,14 @@ func parseMaybeInt(v string) *int {
|
|||||||
return &n
|
return &n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func parseRequiredInt(v string) int {
|
||||||
|
n, err := strconv.Atoi(strings.TrimSpace(v))
|
||||||
|
if err != nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
func pcieLinkGenLabel(gen int) string {
|
func pcieLinkGenLabel(gen int) string {
|
||||||
return fmt.Sprintf("Gen%d", gen)
|
return fmt.Sprintf("Gen%d", gen)
|
||||||
}
|
}
|
||||||
@@ -240,6 +255,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||||
|
if dev.Telemetry == nil {
|
||||||
|
dev.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
dev.Telemetry["nvidia_gpu_index"] = info.Index
|
||||||
if info.TemperatureC != nil {
|
if info.TemperatureC != nil {
|
||||||
dev.TemperatureC = info.TemperatureC
|
dev.TemperatureC = info.TemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse failed: %v", err)
|
t.Fatalf("parse failed: %v", err)
|
||||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
|||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("gpu by normalized bdf not found")
|
t.Fatalf("gpu by normalized bdf not found")
|
||||||
}
|
}
|
||||||
|
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||||
|
t.Fatalf("name: got %q", gpu.Name)
|
||||||
|
}
|
||||||
if gpu.Serial != "GPU-SERIAL-1" {
|
if gpu.Serial != "GPU-SERIAL-1" {
|
||||||
t.Fatalf("serial: got %q", gpu.Serial)
|
t.Fatalf("serial: got %q", gpu.Serial)
|
||||||
}
|
}
|
||||||
@@ -86,6 +89,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||||
}
|
}
|
||||||
|
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
|
||||||
|
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
|
||||||
|
}
|
||||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||||
t.Fatalf("status: got %v", out[0].Status)
|
t.Fatalf("status: got %v", out[0].Status)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package collector
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -79,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||||
|
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||||
|
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||||
|
bmcPatterns := []string{
|
||||||
|
"management system chip",
|
||||||
|
"management controller",
|
||||||
|
"ibmc",
|
||||||
|
"idrac",
|
||||||
|
"ilo vga",
|
||||||
|
"aspeed",
|
||||||
|
"matrox",
|
||||||
|
}
|
||||||
|
for _, bad := range bmcPatterns {
|
||||||
|
if strings.Contains(d, bad) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||||
internalAMDPatterns := []string{
|
internalAMDPatterns := []string{
|
||||||
"dummy function",
|
"dummy function",
|
||||||
@@ -153,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
|
|
||||||
// SVendor/SDevice available but not in schema — skip
|
// SVendor/SDevice available but not in schema — skip
|
||||||
|
|
||||||
|
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
return dev
|
return dev
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
|||||||
return value, true
|
return value, true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||||
|
// speed is below the maximum negotiated speed supported by both ends.
|
||||||
|
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||||
|
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||||
|
warn := statusWarning
|
||||||
|
dev.Status = &warn
|
||||||
|
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||||
|
dev.ErrorDescription = &desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pcieLinkSpeedRank returns a numeric rank for a normalized Gen string (e.g. "Gen4" → 4).
|
||||||
|
// Returns 0 for unrecognised values so comparisons fail safe.
|
||||||
|
func pcieLinkSpeedRank(gen string) int {
|
||||||
|
switch gen {
|
||||||
|
case "Gen1":
|
||||||
|
return 1
|
||||||
|
case "Gen2":
|
||||||
|
return 2
|
||||||
|
case "Gen3":
|
||||||
|
return 3
|
||||||
|
case "Gen4":
|
||||||
|
return 4
|
||||||
|
case "Gen5":
|
||||||
|
return 5
|
||||||
|
case "Gen6":
|
||||||
|
return 6
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func normalizePCILinkSpeed(raw string) string {
|
func normalizePCILinkSpeed(raw string) string {
|
||||||
raw = strings.TrimSpace(strings.ToLower(raw))
|
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||||
switch {
|
switch {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -29,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
|||||||
{name: "raid", class: "RAID bus controller", want: true},
|
{name: "raid", class: "RAID bus controller", want: true},
|
||||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||||
{name: "vga", class: "VGA compatible controller", want: true},
|
{name: "vga", class: "VGA compatible controller", want: true},
|
||||||
|
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||||
|
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -139,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
|
||||||
|
ptr := func(s string) *string { return &s }
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
linkSpeed *string
|
||||||
|
maxSpeed *string
|
||||||
|
wantWarning bool
|
||||||
|
wantGenIn string // substring expected in ErrorDescription when warning
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "degraded Gen1 vs Gen5",
|
||||||
|
linkSpeed: ptr("Gen1"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: true,
|
||||||
|
wantGenIn: "Gen1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "at max Gen5",
|
||||||
|
linkSpeed: ptr("Gen5"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "degraded Gen4 vs Gen5",
|
||||||
|
linkSpeed: ptr("Gen4"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: true,
|
||||||
|
wantGenIn: "Gen4",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing current speed — no warning",
|
||||||
|
linkSpeed: nil,
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing max speed — no warning",
|
||||||
|
linkSpeed: ptr("Gen1"),
|
||||||
|
maxSpeed: nil,
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
ok := statusOK
|
||||||
|
dev.Status = &ok
|
||||||
|
dev.LinkSpeed = tt.linkSpeed
|
||||||
|
dev.MaxLinkSpeed = tt.maxSpeed
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
gotWarn := dev.Status != nil && *dev.Status == statusWarning
|
||||||
|
if gotWarn != tt.wantWarning {
|
||||||
|
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
|
||||||
|
}
|
||||||
|
if tt.wantWarning {
|
||||||
|
if dev.ErrorDescription == nil {
|
||||||
|
t.Fatal("expected ErrorDescription to be set")
|
||||||
|
}
|
||||||
|
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
|
||||||
|
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if dev.ErrorDescription != nil {
|
||||||
|
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -160,11 +160,57 @@ type psuSDR struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var psuSlotPatterns = []*regexp.Regexp{
|
var psuSlotPatterns = []*regexp.Regexp{
|
||||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
|
// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
|
||||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
|
// does not fire after the digit; match explicitly with underscore terminator.
|
||||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
|
||||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||||
|
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||||
|
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
|
||||||
|
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1
|
||||||
|
// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
|
||||||
|
// Must be last: "power supply N" is already caught by the pattern above.
|
||||||
|
regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuInputPowerKeywords matches AC-input power sensor names across vendors:
|
||||||
|
// MSI: PSU1_POWER_IN, PSU1_PIN
|
||||||
|
// MLT: PSU1_PIN
|
||||||
|
// xFusion: (matched via default fallback — no explicit keyword)
|
||||||
|
// HPE: PS1 Input Power, PS1 Input Watts
|
||||||
|
func isPSUInputPower(name string) bool {
|
||||||
|
return strings.Contains(name, "input power") ||
|
||||||
|
strings.Contains(name, "input watts") ||
|
||||||
|
strings.Contains(name, "_pin") ||
|
||||||
|
strings.Contains(name, " pin") ||
|
||||||
|
strings.Contains(name, "_power_in") ||
|
||||||
|
strings.Contains(name, "power_in")
|
||||||
|
}
|
||||||
|
|
||||||
|
// isPSUOutputPower matches DC-output power sensor names across vendors:
|
||||||
|
// MSI: PSU1_POWER_OUT
|
||||||
|
// MLT: PSU1_POUT
|
||||||
|
// xFusion: PS1 POut
|
||||||
|
func isPSUOutputPower(name string) bool {
|
||||||
|
return strings.Contains(name, "output power") ||
|
||||||
|
strings.Contains(name, "output watts") ||
|
||||||
|
strings.Contains(name, "_pout") ||
|
||||||
|
strings.Contains(name, " pout") ||
|
||||||
|
strings.Contains(name, "_power_out") ||
|
||||||
|
strings.Contains(name, "power_out") ||
|
||||||
|
strings.Contains(name, "power supply bay") ||
|
||||||
|
strings.Contains(name, "psu bay")
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||||
|
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||||
|
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||||
|
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||||
|
v := parseFloatPtr(raw)
|
||||||
|
if v == nil || *v <= 0 || *v > max {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return v
|
||||||
}
|
}
|
||||||
|
|
||||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||||
@@ -194,24 +240,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
|||||||
|
|
||||||
lowerName := strings.ToLower(name)
|
lowerName := strings.ToLower(name)
|
||||||
switch {
|
switch {
|
||||||
case strings.Contains(lowerName, "input power"):
|
case isPSUInputPower(lowerName):
|
||||||
entry.inputPowerW = parseFloatPtr(value)
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||||
case strings.Contains(lowerName, "output power"):
|
case isPSUOutputPower(lowerName):
|
||||||
entry.outputPowerW = parseFloatPtr(value)
|
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
|
||||||
entry.outputPowerW = parseFloatPtr(value)
|
|
||||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||||
entry.inputVoltage = parseFloatPtr(value)
|
entry.inputVoltage = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "temp"):
|
case strings.Contains(lowerName, "temp"):
|
||||||
entry.temperatureC = parseFloatPtr(value)
|
entry.temperatureC = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||||
entry.healthPct = parsePercentPtr(value)
|
entry.healthPct = parsePercentPtr(value)
|
||||||
|
default:
|
||||||
|
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||||
|
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||||
|
// AC input if the value looks like wattage and no better data is set yet.
|
||||||
|
if entry.inputPowerW == nil {
|
||||||
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
out[slot] = entry
|
out[slot] = entry
|
||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PSUSlotPower holds SDR power readings for one PSU slot.
|
||||||
|
// Slot key used by PSUSlotsFromSDR is the 0-based index string,
|
||||||
|
// matching HardwarePowerSupply.Slot in the audit schema.
|
||||||
|
type PSUSlotPower struct {
|
||||||
|
InputW *float64 `json:"input_w,omitempty"`
|
||||||
|
OutputW *float64 `json:"output_w,omitempty"`
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||||
|
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||||
|
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||||
|
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||||
|
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||||
|
sdr := parsePSUSDR(sdrOutput)
|
||||||
|
if len(sdr) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make(map[string]PSUSlotPower, len(sdr))
|
||||||
|
for slot, entry := range sdr {
|
||||||
|
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||||
|
out[key] = PSUSlotPower{
|
||||||
|
InputW: entry.inputPowerW,
|
||||||
|
OutputW: entry.outputPowerW,
|
||||||
|
Status: entry.status,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||||
if len(sdr) == 0 {
|
if len(sdr) == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
|||||||
{name: "PWS1 Status", want: 1},
|
{name: "PWS1 Status", want: 1},
|
||||||
{name: "Power Supply Bay 8", want: 8},
|
{name: "Power Supply Bay 8", want: 8},
|
||||||
{name: "PS 6 Input Power", want: 6},
|
{name: "PS 6 Input Power", want: 6},
|
||||||
|
// MSI underscore format — \b does not fire between digit and '_'
|
||||||
|
{name: "PSU1_POWER_IN", want: 1},
|
||||||
|
{name: "PSU2_POWER_OUT", want: 2},
|
||||||
|
{name: "PSU4_STATUS", want: 4},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParsePSUSDRMSIFormat(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
raw := `
|
||||||
|
PSU1_STATUS | F1h | ok
|
||||||
|
PSU1_POWER_OUT | 928 Watts | ok
|
||||||
|
PSU1_POWER_IN | 976 Watts | ok
|
||||||
|
PSU2_STATUS | F2h | ok
|
||||||
|
PSU2_POWER_OUT | 944 Watts | ok
|
||||||
|
PSU2_POWER_IN | 992 Watts | ok
|
||||||
|
`
|
||||||
|
got := parsePSUSDR(raw)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len(got)=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
|
||||||
|
t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
|
||||||
|
}
|
||||||
|
if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
|
||||||
|
t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
|
||||||
|
}
|
||||||
|
if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
|
||||||
|
t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
4878
audit/internal/platform/benchmark.go
Normal file
4878
audit/internal/platform/benchmark.go
Normal file
File diff suppressed because it is too large
Load Diff
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
@@ -0,0 +1,735 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
benchmarkPowerAutotuneVersion = 1
|
||||||
|
benchmarkPowerAutotuneIdleSec = 60
|
||||||
|
benchmarkPowerAutotuneLoadSec = 90
|
||||||
|
benchmarkPowerAutotuneSampleInterval = 3
|
||||||
|
defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||||
|
baseDir = strings.TrimSpace(baseDir)
|
||||||
|
if baseDir == "" {
|
||||||
|
return defaultBenchmarkPowerSourceConfigPath
|
||||||
|
}
|
||||||
|
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var cfg BenchmarkPowerAutotuneConfig
|
||||||
|
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||||
|
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||||
|
}
|
||||||
|
return &cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if cfg.Version <= 0 {
|
||||||
|
cfg.Version = benchmarkPowerAutotuneVersion
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(cfg, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResetBenchmarkPowerAutotuneConfig(path string) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeBenchmarkPowerSource(source string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
return BenchmarkPowerSourceSDRPSUInput
|
||||||
|
default:
|
||||||
|
return BenchmarkPowerSourceDCMI
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResolveSystemPowerDecision decides which server power source readings
// should come from. A previously saved autotune config (under exportDir)
// wins; without one, both sources are probed once and a temporary fallback
// is chosen, preferring SDR PSU input over DCMI when it responds.
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
	cfg, err := LoadSystemPowerSourceConfig(exportDir)
	if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
		// A persisted autotune decision exists; normalize it so a stale or
		// hand-edited value still maps to a known source.
		selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
		return SystemPowerSourceDecision{
			Configured:      true,
			SelectedSource:  selected,
			EffectiveSource: selected,
			Mode:            "autotuned",
			Reason:          strings.TrimSpace(cfg.Reason),
			ConfiguredAt:    cfg.UpdatedAt,
		}
	}

	// No config: probe the sources once; use SDR PSU input if it yielded a
	// positive reading, otherwise fall back to DCMI unconditionally.
	sources := sampleBenchmarkPowerSources()
	if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
		return SystemPowerSourceDecision{
			Configured:      false,
			EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
			Mode:            "fallback",
			Reason:          "autotune config not found; using temporary fallback source sdr_psu_input",
		}
	}
	return SystemPowerSourceDecision{
		Configured:      false,
		EffectiveSource: BenchmarkPowerSourceDCMI,
		Mode:            "fallback",
		Reason:          "autotune config not found; using temporary fallback source dcmi",
	}
}
|
||||||
|
|
||||||
|
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||||
|
decision := ResolveSystemPowerDecision(exportDir)
|
||||||
|
if decision.EffectiveSource != "" {
|
||||||
|
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||||
|
return value, decision, nil
|
||||||
|
} else if decision.Configured {
|
||||||
|
fallback := BenchmarkPowerSourceDCMI
|
||||||
|
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||||
|
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||||
|
decision.EffectiveSource = fallback
|
||||||
|
return fallbackValue, decision, nil
|
||||||
|
}
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||||
|
return 0, decision, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||||
|
switch normalizeBenchmarkPowerSource(source) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
sdr := sampleIPMISDRPowerSensors()
|
||||||
|
if sdr.PSUInW > 0 {
|
||||||
|
return sdr.PSUInW, nil
|
||||||
|
}
|
||||||
|
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||||
|
default:
|
||||||
|
return queryIPMIServerPowerW()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceDCMI] = w
|
||||||
|
}
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectSelectedPowerSourceSamples runs a background sampler against the
// given power source for durationSec seconds (or until ctx is cancelled)
// and returns the watt readings it collected.
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
	if durationSec <= 0 {
		return nil
	}
	stopCh := make(chan struct{})
	doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
	// Wait for either cancellation or the full sampling window to elapse.
	select {
	case <-ctx.Done():
	case <-time.After(time.Duration(durationSec) * time.Second):
	}
	// Stop the sampler and collect its results. doneCh has a 1-slot buffer
	// (see startSelectedPowerSourceSampler), so the goroutine cannot leak.
	close(stopCh)
	return <-doneCh
}
|
||||||
|
|
||||||
|
// startSelectedPowerSourceSampler launches a goroutine that samples the given
// power source every intervalSec seconds until stopCh is closed, then sends
// the accumulated positive readings on the returned channel and closes it.
// A non-positive interval falls back to the autotune default.
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
	if intervalSec <= 0 {
		intervalSec = benchmarkPowerAutotuneSampleInterval
	}
	// Buffer of 1 lets the goroutine deliver its result and exit even if the
	// receiver is not yet blocked on the channel.
	ch := make(chan []float64, 1)
	go func() {
		defer close(ch)
		var samples []float64
		record := func() {
			// Keep only positive readings; query errors and zero watts are
			// silently dropped.
			if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
				samples = append(samples, w)
			}
		}
		// Take one sample immediately so short windows still yield data.
		record()
		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				ch <- samples
				return
			case <-ticker.C:
				record()
			}
		}
	}()
	return ch
}
|
||||||
|
|
||||||
|
// benchmarkPowerAutotuneSample is one telemetry snapshot taken during an
// autotune phase ("idle" or "load").
type benchmarkPowerAutotuneSample struct {
	ElapsedSec     float64            // seconds since the phase started
	GPUAvgUsagePct float64            // utilization averaged across sampled GPUs
	CPUUsagePct    float64            // host CPU load percentage
	GPUSumPowerW   float64            // sum of per-GPU power draw, watts
	Sources        map[string]float64 // server power source name -> watts (positive readings only)
}
|
||||||
|
|
||||||
|
// collectBenchmarkPowerAutotuneSamples gathers telemetry snapshots (CPU load,
// server power sources, and GPU utilization/power for gpuIndices) roughly
// every benchmarkPowerAutotuneSampleInterval seconds for durationSec seconds,
// logging each sample via logFunc. It returns whatever was collected so far
// when ctx is cancelled.
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
	if durationSec <= 0 {
		return nil
	}
	var out []benchmarkPowerAutotuneSample
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	start := time.Now()
	for {
		if ctx.Err() != nil {
			return out
		}
		row := benchmarkPowerAutotuneSample{
			ElapsedSec:  time.Since(start).Seconds(),
			CPUUsagePct: sampleCPULoadPct(),
			Sources:     sampleBenchmarkPowerSources(),
		}
		// GPU metrics are best-effort: a failed query leaves the GPU fields
		// at zero for this row.
		if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
			var usageSum float64
			for _, gpu := range gpuRows {
				row.GPUSumPowerW += gpu.PowerW
				usageSum += gpu.UsagePct
			}
			row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
		}
		out = append(out, row)
		logBenchmarkPowerAutotuneSample(phase, row, logFunc)
		// Check the deadline after recording so at least one sample is taken
		// even for very short windows.
		if time.Now().After(deadline) {
			return out
		}
		select {
		case <-ctx.Done():
			return out
		case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
		}
	}
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||||
|
} else {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
sample.ElapsedSec,
|
||||||
|
sample.GPUAvgUsagePct,
|
||||||
|
sample.GPUSumPowerW,
|
||||||
|
sample.CPUUsagePct,
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil || len(samples) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
var gpuPower []float64
|
||||||
|
sourceBuckets := map[string][]float64{}
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
values := sourceBuckets[source]
|
||||||
|
if len(values) == 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
len(samples),
|
||||||
|
benchmarkMean(gpuUsage),
|
||||||
|
benchmarkPercentile(gpuUsage, 95),
|
||||||
|
benchmarkMean(gpuPower),
|
||||||
|
benchmarkMean(cpuUsage),
|
||||||
|
benchmarkPercentile(cpuUsage, 95),
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if !candidate.Available {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||||
|
candidate.Source,
|
||||||
|
candidate.IdleAvgW,
|
||||||
|
candidate.LoadAvgW,
|
||||||
|
candidate.DeltaW,
|
||||||
|
gpuDelta,
|
||||||
|
candidate.RelativeError,
|
||||||
|
candidate.Confidence*100,
|
||||||
|
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||||
|
))
|
||||||
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateBenchmarkPowerAutotuneIdle checks that the machine was genuinely
// idle during the idle sampling window. Thresholds: GPU avg <= 5%, GPU p95
// <= 10%, CPU avg <= 20%, CPU p95 <= 35%. On failure, Reason names the first
// threshold exceeded; Valid is true only when all thresholds pass.
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
	result := &BenchmarkPowerAutotuneValidation{}
	if len(samples) == 0 {
		result.Reason = "no idle telemetry samples collected"
		return result
	}
	var gpuUsage []float64
	var cpuUsage []float64
	for _, sample := range samples {
		gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
		// Zero CPU readings are treated as "no data" and excluded from the
		// CPU statistics (GPU readings are always counted).
		if sample.CPUUsagePct > 0 {
			cpuUsage = append(cpuUsage, sample.CPUUsagePct)
		}
	}
	result.GPUSamples = len(gpuUsage)
	result.CPUSamples = len(cpuUsage)
	// All reported percentages are rounded to one decimal place; thresholds
	// below are applied to the rounded values.
	result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
	result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
	result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
	result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
	switch {
	case result.GPUAvgUsagePct > 5:
		result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
	case result.GPUP95UsagePct > 10:
		result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
	case result.CPUAvgUsagePct > 20:
		result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
	case result.CPUP95UsagePct > 35:
		result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
	default:
		result.Valid = true
	}
	return result
}
|
||||||
|
|
||||||
|
// chooseBenchmarkPowerAutotuneSource picks the server power source whose
// idle→load power delta best matches the GPU-reported power delta. It
// returns the winning source name, all candidates (with Selected and
// SelectionNotes filled in on the winner), the idle and load GPU power
// averages, and an error when no source produced usable samples.
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
	idleBySource := map[string][]float64{}
	loadBySource := map[string][]float64{}
	var idleGPU []float64
	var loadGPU []float64
	// Bucket positive per-source readings and collect GPU power for each phase.
	for _, sample := range idle {
		idleGPU = append(idleGPU, sample.GPUSumPowerW)
		for source, value := range sample.Sources {
			if value > 0 {
				idleBySource[source] = append(idleBySource[source], value)
			}
		}
	}
	for _, sample := range load {
		loadGPU = append(loadGPU, sample.GPUSumPowerW)
		for source, value := range sample.Sources {
			if value > 0 {
				loadBySource[source] = append(loadBySource[source], value)
			}
		}
	}
	idleGPUAvg := benchmarkMean(idleGPU)
	loadGPUAvg := benchmarkMean(loadGPU)
	gpuDelta := loadGPUAvg - idleGPUAvg
	// A non-positive delta (e.g. missing idle data) falls back to the raw
	// load average as the reference.
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}

	candidates := []BenchmarkPowerAutotuneCandidate{
		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
	}
	// Only candidates with samples in both phases and a positive delta are
	// eligible.
	available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
	for _, candidate := range candidates {
		if candidate.Available && candidate.DeltaW > 0 {
			available = append(available, candidate)
		}
	}
	if len(available) == 0 {
		return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
	}
	// Rank by relative error (lower is better); within a 0.10 tie band the
	// SDR PSU input is preferred, and exact error ties break on sample count.
	sort.Slice(available, func(i, j int) bool {
		if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
			if available[i].Source != available[j].Source {
				return available[i].Source == BenchmarkPowerSourceSDRPSUInput
			}
		}
		if available[i].RelativeError != available[j].RelativeError {
			return available[i].RelativeError < available[j].RelativeError
		}
		return available[i].Samples > available[j].Samples
	})
	selected := available[0]
	// Mark the winner in the original candidates slice so callers see the
	// selection alongside the full comparison.
	for idx := range candidates {
		if candidates[idx].Source == selected.Source {
			candidates[idx].Selected = true
			candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
		}
	}
	return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
}
|
||||||
|
|
||||||
|
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||||
|
candidate := BenchmarkPowerAutotuneCandidate{
|
||||||
|
Source: source,
|
||||||
|
Available: len(idle) > 0 && len(load) > 0,
|
||||||
|
Samples: minInt(len(idle), len(load)),
|
||||||
|
}
|
||||||
|
if !candidate.Available {
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
candidate.IdleAvgW = benchmarkMean(idle)
|
||||||
|
candidate.LoadAvgW = benchmarkMean(load)
|
||||||
|
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||||
|
if gpuDelta > 0 {
|
||||||
|
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||||
|
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||||
|
}
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderBenchmarkPowerAutotuneSummary renders the autotune result as flat
// key=value lines (one per line) for summary.txt. Optional sections —
// selected source, idle validation, per-candidate metrics — are emitted only
// when present.
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
	fmt.Fprintf(&b, "status=%s\n", result.Status)
	fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
	fmt.Fprintf(&b, "profile=%s\n", result.Profile)
	fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
	fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
	fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
	}
	if result.IdleValidation != nil {
		fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
		}
	}
	// One availability line per candidate; numeric metrics only for the
	// candidates that actually produced samples.
	for _, candidate := range result.Candidates {
		fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
		if candidate.Available {
			fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
			fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
			fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
			fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
		}
	}
	return b.String()
}
|
||||||
|
|
||||||
|
// renderBenchmarkPowerAutotuneReport renders the autotune result as a
// Markdown report (report.md): a header block, an optional idle-validation
// section, a candidate comparison table, and trailing notes.
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Source Autotune\n\n")
	fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
	fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
	fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
	fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
	fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
	fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
	}
	b.WriteString("\n")
	if result.IdleValidation != nil {
		b.WriteString("## Idle Validation\n\n")
		fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
		}
		b.WriteString("\n")
	}
	if len(result.Candidates) > 0 {
		b.WriteString("## Candidates\n\n")
		b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
		b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
		for _, candidate := range result.Candidates {
			// Unavailable candidates get an em-dash row instead of numbers.
			if !candidate.Available {
				fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
				continue
			}
			selected := "no"
			if candidate.Selected {
				selected = "yes"
			}
			fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
				candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
		}
		b.WriteString("\n")
	}
	// Free-form notes accumulated during the run, one bullet each.
	for _, note := range result.Notes {
		fmt.Fprintf(&b, "- %s\n", note)
	}
	return b.String()
}
|
||||||
|
|
||||||
|
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
||||||
|
allDevices := joinIndexList(gpuIndices)
|
||||||
|
switch strings.TrimSpace(strings.ToLower(kind)) {
|
||||||
|
case "power-fit", "power", "nvidia-bench-power":
|
||||||
|
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
||||||
|
if err == nil {
|
||||||
|
return cmd, "power-fit"
|
||||||
|
}
|
||||||
|
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
||||||
|
default:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", fmt.Sprintf("%d", durationSec),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
if sizeMB > 0 {
|
||||||
|
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
||||||
|
}
|
||||||
|
return cmd, "performance"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunNvidiaPowerSourceAutotune runs the full power-source autotune pipeline:
// idle validation, a full-load stage with concurrent telemetry sampling,
// source selection, and persistence of the chosen source plus run artifacts
// (result.json, summary.txt, report.md) under a timestamped directory inside
// baseDir. It returns that run directory (when created) and an error if any
// stage failed; artifacts are still written on most failure paths.
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	// Substitute a no-op logger so later calls never need a nil check.
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/autotune"
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}
	selected, err := resolveNvidiaGPUSelection(nil, nil)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
	}
	// Each run gets its own timestamped directory for artifacts.
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "autotune-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
	// Start pessimistic: Status stays FAILED until the run completes.
	result := BenchmarkPowerAutotuneResult{
		GeneratedAt:       time.Now().UTC(),
		Hostname:          hostname,
		ServerModel:       readServerModel(),
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		Status:            "FAILED",
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
	}

	// ── Stage 1: idle window — verify the machine is actually idle ────────
	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
	if result.IdleValidation == nil || !result.IdleValidation.Valid {
		if result.IdleValidation != nil {
			result.IdleValidationError = result.IdleValidation.Reason
			logFunc(result.IdleValidation.Reason)
		}
		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
		// Artifacts are written even on failure so operators can inspect why.
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("%s", result.IdleValidationError)
	}

	// ── Stage 2: full load, with telemetry sampled concurrently ───────────
	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
	go func() {
		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
	}()
	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
	// Best-effort: keep the raw load-command output regardless of outcome.
	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
	loadSamples := <-loadSamplesCh
	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
	if runErr != nil {
		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
	}

	// ── Stage 3: compare sources against the GPU power delta ──────────────
	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
	result.Candidates = candidates
	result.GPUPowerIdleW = idleGPUAvg
	result.GPUPowerLoadW = loadGPUAvg
	if chooseErr != nil {
		result.Notes = append(result.Notes, chooseErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, chooseErr
	}
	gpuDelta := loadGPUAvg - idleGPUAvg
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}
	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
	result.SelectedSource = selectedSource
	result.Status = "OK"
	// Pull the winner's confidence/notes for the persisted config.
	var confidence float64
	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
	for _, candidate := range candidates {
		if candidate.Selected {
			confidence = candidate.Confidence
			if strings.TrimSpace(candidate.SelectionNotes) != "" {
				selectionReason = candidate.SelectionNotes
			}
			break
		}
	}
	// ── Stage 4: persist the decision and the run artifacts ───────────────
	cfg := BenchmarkPowerAutotuneConfig{
		Version:           benchmarkPowerAutotuneVersion,
		UpdatedAt:         time.Now().UTC(),
		SelectedSource:    selectedSource,
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
		Confidence:        confidence,
		Reason:            selectionReason,
	}
	result.Config = &cfg
	configPath := BenchmarkPowerSourceConfigPath(baseDir)
	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
		result.Status = "FAILED"
		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
			return "", writeErr
		}
		return runDir, err
	}
	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
		return "", err
	}
	return runDir, nil
}
|
||||||
|
|
||||||
|
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
||||||
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("marshal autotune result: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune result.json: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune summary.txt: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune report.md: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// minInt returns the smaller of two ints.
func minInt(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||||||
|
|
||||||
|
// Reference a name from os/exec via the blank identifier so the import stays
// in use even when no other code in this file calls into the package.
var _ = exec.ErrNotFound
|
||||||
558
audit/internal/platform/benchmark_report.go
Normal file
558
audit/internal/platform/benchmark_report.go
Normal file
@@ -0,0 +1,558 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// renderBenchmarkReport renders the Markdown benchmark report for result;
// it is a thin alias for renderBenchmarkReportWithCharts.
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
	return renderBenchmarkReportWithCharts(result)
}
|
||||||
|
|
||||||
|
// renderBenchmarkReportWithCharts builds the complete markdown benchmark
// report: header/identity block, executive summary, a five-perspective
// balanced scorecard, per-GPU details, interconnect, server power, PSU,
// cooling, scalability, and the raw-file index. The output is plain markdown
// intended to be readable as text too (tables are padded by fmtMDTable).
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
	var b strings.Builder

	// ── Header ────────────────────────────────────────────────────────────────
	b.WriteString("# Bee NVIDIA Benchmark Report\n\n")

	// System identity block
	if result.ServerModel != "" {
		fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
	}
	if result.Hostname != "" {
		fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
	}
	// GPU models summary
	if len(result.GPUs) > 0 {
		// Count GPUs per model while preserving first-seen order.
		modelCount := make(map[string]int)
		var modelOrder []string
		for _, g := range result.GPUs {
			m := strings.TrimSpace(g.Name)
			if m == "" {
				m = "Unknown GPU"
			}
			if modelCount[m] == 0 {
				modelOrder = append(modelOrder, m)
			}
			modelCount[m]++
		}
		var parts []string
		for _, m := range modelOrder {
			if modelCount[m] == 1 {
				parts = append(parts, m)
			} else {
				parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
			}
		}
		fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
	}
	fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
	if result.RampStep > 0 && result.RampTotal > 0 {
		fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
		if result.RampRunID != "" {
			fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
		}
	} else if result.ParallelGPUs {
		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
	}
	if result.ScalabilityScore > 0 {
		fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
	}
	if result.PlatformPowerScore > 0 {
		fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
	}
	fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
	b.WriteString("\n")

	// ── Executive Summary ─────────────────────────────────────────────────────
	if len(result.Findings) > 0 {
		b.WriteString("## Executive Summary\n\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(&b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}

	if len(result.Warnings) > 0 {
		b.WriteString("## Warnings\n\n")
		for _, warning := range result.Warnings {
			fmt.Fprintf(&b, "- %s\n", warning)
		}
		b.WriteString("\n")
	}

	// ── Balanced Scorecard ────────────────────────────────────────────────────
	b.WriteString("## Balanced Scorecard\n\n")

	// Perspective 1: Compatibility — hard stops
	b.WriteString("### 1. Compatibility\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			thermalThrottle := "-"
			if gpu.Scores.ThermalThrottlePct > 0 {
				thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
			}
			// Fan duty at throttle is only meaningful when the GPU actually
			// thermal-throttled and fan telemetry was captured.
			fanAtThrottle := "-"
			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
				fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
			}
			ecc := "-"
			if gpu.ECC.Uncorrected > 0 {
				ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
			}
			// Hard stop: uncorrectable ECC errors, or thermal throttling while
			// the fans still had headroom (p95 duty below 95%).
			compatStatus := "✓ OK"
			if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
				compatStatus = "⛔ HARD STOP"
			}
			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
		b.WriteString("\n")
	}

	// Perspective 2: Thermal headroom
	b.WriteString("### 2. Thermal Headroom\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			// Fall back to conservative defaults when NVML did not report limits.
			shutdownTemp := gpu.ShutdownTempC
			if shutdownTemp <= 0 {
				shutdownTemp = 90
			}
			slowdownTemp := gpu.SlowdownTempC
			if slowdownTemp <= 0 {
				slowdownTemp = 80
			}
			headroom := gpu.Scores.TempHeadroomC
			thermalStatus := "✓ OK"
			switch {
			case headroom < 10:
				thermalStatus = "⛔ CRITICAL"
			case gpu.Steady.P95TempC >= slowdownTemp:
				thermalStatus = "⚠ WARNING"
			}
			throttlePct := "-"
			if gpu.Scores.ThermalThrottlePct > 0 {
				throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
			}
			rows = append(rows, []string{
				fmt.Sprintf("GPU %d", gpu.Index),
				fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
				fmt.Sprintf("%.0f°C", slowdownTemp),
				fmt.Sprintf("%.0f°C", shutdownTemp),
				fmt.Sprintf("%.1f°C", headroom),
				throttlePct,
				thermalStatus,
			})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
		b.WriteString("\n")
	}

	// Perspective 3: Power delivery
	b.WriteString("### 3. Power Delivery\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			powerCap := "-"
			if gpu.Scores.PowerCapThrottlePct > 0 {
				powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
			}
			fanDuty := "-"
			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
				fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
			}
			// More than 5% of the window spent power-cap throttled is flagged.
			powerStatus := "✓ OK"
			if gpu.Scores.PowerCapThrottlePct > 5 {
				powerStatus = "⚠ POWER LIMITED"
			}
			rows = append(rows, []string{
				fmt.Sprintf("GPU %d", gpu.Index),
				powerCap,
				fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
				fanDuty,
				powerStatus,
			})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
		b.WriteString("\n")
	}

	// Perspective 4: Performance
	b.WriteString("### 4. Performance\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			// Zero scores render as "-" (metric unavailable for this run).
			synthetic := "-"
			if gpu.Scores.SyntheticScore > 0 {
				synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
			}
			mixed := "-"
			if gpu.Scores.MixedScore > 0 {
				mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
			}
			mixedEff := "-"
			if gpu.Scores.MixedEfficiency > 0 {
				mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
			}
			topsPerSM := "-"
			if gpu.Scores.TOPSPerSMPerGHz > 0 {
				topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
			}
			rows = append(rows, []string{
				fmt.Sprintf("GPU %d", gpu.Index),
				fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
				synthetic, mixed, mixedEff, topsPerSM,
			})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
		if len(result.PerformanceRampSteps) > 0 {
			fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
		}
		b.WriteString("\n")
	}

	// Perspective 5: Anomaly flags
	b.WriteString("### 5. Anomalies\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			eccCorr := "-"
			if gpu.ECC.Corrected > 0 {
				eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
			}
			syncBoost := "-"
			if gpu.Scores.SyncBoostThrottlePct > 0 {
				syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
			}
			// Sustain scores below 70 mark the telemetry trace as unstable.
			powerVar := "OK"
			if gpu.Scores.PowerSustainScore < 70 {
				powerVar = "⚠ unstable"
			}
			thermalVar := "OK"
			if gpu.Scores.ThermalSustainScore < 70 {
				thermalVar = "⚠ unstable"
			}
			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
		b.WriteString("\n")
	}

	// ── Per GPU detail ────────────────────────────────────────────────────────
	b.WriteString("## Per-GPU Details\n\n")
	for _, gpu := range result.GPUs {
		name := strings.TrimSpace(gpu.Name)
		if name == "" {
			name = "Unknown GPU"
		}
		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)

		// Identity
		if gpu.BusID != "" {
			fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
		}
		if gpu.VBIOS != "" {
			fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
		}
		if gpu.ComputeCapability != "" {
			fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
		}
		if gpu.MultiprocessorCount > 0 {
			fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
		}
		if gpu.PowerLimitW > 0 {
			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
		}
		if gpu.PowerLimitDerated {
			fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
		}
		if gpu.CalibratedPeakPowerW > 0 {
			if gpu.CalibratedPeakTempC > 0 {
				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
			} else {
				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
			}
		}
		if gpu.LockedGraphicsClockMHz > 0 {
			fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
		}
		b.WriteString("\n")

		// Steady-state telemetry
		if benchmarkTelemetryAvailable(gpu.Steady) {
			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
			b.WriteString(fmtMDTable(
				[]string{"", "Avg", "P95"},
				[][]string{
					{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
					{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
					{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
					{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
					{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
				},
			))
			b.WriteString("\n")
		} else {
			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
		}

		// Per-precision stability phases.
		if len(gpu.PrecisionSteady) > 0 {
			b.WriteString("**Per-precision stability:**\n\n")
			var precRows [][]string
			for _, p := range gpu.PrecisionSteady {
				eccCorr := "—"
				eccUncorr := "—"
				if !p.ECC.IsZero() {
					eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
					eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
				}
				status := p.Status
				if strings.TrimSpace(status) == "" {
					status = "OK"
				}
				precRows = append(precRows, []string{
					p.Precision, status,
					fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
					fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
					fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
					eccCorr, eccUncorr,
				})
			}
			b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
			b.WriteString("\n")
		} else {
			// Legacy: show combined-window variance.
			fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
				gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
		}

		// ECC summary
		if !gpu.ECC.IsZero() {
			fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
				gpu.ECC.Corrected, gpu.ECC.Uncorrected)
		}

		// Throttle
		throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
		if throttle != "none" {
			fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
		}

		// Precision results
		if len(gpu.PrecisionResults) > 0 {
			b.WriteString("**Precision results:**\n\n")
			var presRows [][]string
			for _, p := range gpu.PrecisionResults {
				if p.Supported {
					presRows = append(presRows, []string{
						p.Name,
						fmt.Sprintf("%.2f", p.TeraOpsPerSec),
						fmt.Sprintf("×%.3g", p.Weight),
						fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
						fmt.Sprintf("%d", p.Lanes),
						fmt.Sprintf("%d", p.Iterations),
					})
				} else {
					presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
				}
			}
			b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
			b.WriteString("\n")
		}

		// Degradation / Notes
		if len(gpu.DegradationReasons) > 0 {
			fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
		}
		if len(gpu.Notes) > 0 {
			b.WriteString("**Notes:**\n\n")
			for _, note := range gpu.Notes {
				fmt.Fprintf(&b, "- %s\n", note)
			}
			b.WriteString("\n")
		}
	}

	// ── Interconnect ──────────────────────────────────────────────────────────
	if result.Interconnect != nil {
		b.WriteString("## Interconnect (NCCL)\n\n")
		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
		if result.Interconnect.Supported {
			b.WriteString(fmtMDTable(
				[]string{"Metric", "Avg", "Max"},
				[][]string{
					{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
					{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
				},
			))
			b.WriteString("\n")
		}
		for _, note := range result.Interconnect.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		if len(result.Interconnect.Notes) > 0 {
			b.WriteString("\n")
		}
	}

	// ── Server Power ───────────────────────────────────────────────────────────
	if sp := result.ServerPower; sp != nil {
		title := "## Server Power\n\n"
		if sp.Source != "" {
			title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
		}
		b.WriteString(title)
		if !sp.Available {
			b.WriteString("Server power measurement unavailable.\n\n")
		} else {
			spRows := [][]string{
				{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
				{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
				{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
				{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
			}
			if sp.ReportingRatio > 0 {
				spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
			}
			b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
			b.WriteString("\n")
		}
		for _, note := range sp.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		if len(sp.Notes) > 0 {
			b.WriteString("\n")
		}
	}

	// ── PSU Issues ────────────────────────────────────────────────────────────
	if len(result.PSUIssues) > 0 {
		b.WriteString("## PSU Issues\n\n")
		b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
		for _, issue := range result.PSUIssues {
			fmt.Fprintf(&b, "- ⛔ %s\n", issue)
		}
		b.WriteString("\n")
	}

	// ── Cooling ───────────────────────────────────────────────────────────────
	if cooling := result.Cooling; cooling != nil {
		b.WriteString("## Cooling\n\n")
		if cooling.Available {
			dutyAvg, dutyP95 := "N/A", "N/A"
			if cooling.FanDutyCycleAvailable {
				dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
				dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
			}
			b.WriteString(fmtMDTable(
				[]string{"Metric", "Value"},
				[][]string{
					{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
					{"Average fan duty cycle", dutyAvg},
					{"P95 fan duty cycle", dutyP95},
				},
			))
			b.WriteString("\n")
		} else {
			b.WriteString("Cooling telemetry unavailable.\n\n")
		}
		for _, note := range cooling.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		if len(cooling.Notes) > 0 {
			b.WriteString("\n")
		}
	}

	// ── Platform Scalability ──────────────────────────────────────────────────
	if len(result.PerformanceRampSteps) > 0 {
		b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
		fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
		var scalRows [][]string
		for _, step := range result.PerformanceRampSteps {
			scalRows = append(scalRows, []string{
				fmt.Sprintf("%d", step.StepIndex),
				joinIndexList(step.GPUIndices),
				fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
				fmt.Sprintf("%.1f%%", step.ScalabilityPct),
			})
		}
		b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
		b.WriteString("\n")
	}

	// ── Raw files ─────────────────────────────────────────────────────────────
	b.WriteString("## Raw Files\n\n")
	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
	b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
	if result.Interconnect != nil {
		b.WriteString("- `nccl-all-reduce.log`\n")
	}
	return b.String()
}
|
||||||
|
|
||||||
|
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||||
|
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||||
|
// duration is unknown (0), raw seconds are shown instead.
|
||||||
|
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||||
|
type counter struct {
|
||||||
|
label string
|
||||||
|
us uint64
|
||||||
|
}
|
||||||
|
counters := []counter{
|
||||||
|
{"sw_power", t.SWPowerCapUS},
|
||||||
|
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||||
|
{"sync_boost", t.SyncBoostUS},
|
||||||
|
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||||
|
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||||
|
}
|
||||||
|
var parts []string
|
||||||
|
for _, c := range counters {
|
||||||
|
if c.us == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sec := float64(c.us) / 1e6
|
||||||
|
if steadyDurationSec > 0 {
|
||||||
|
pct := sec / steadyDurationSec * 100
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||||
|
} else if sec < 1 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(parts) == 0 {
|
||||||
|
return "none"
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||||
|
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||||
|
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||||||
|
var best float64
|
||||||
|
for i, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||||||
|
if i == 0 || gpu.Scores.CompositeScore > best {
|
||||||
|
best = gpu.Scores.CompositeScore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||||||
|
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fmtMDTable renders a markdown table whose columns are padded to a uniform
// width, so the table stays readable as plain text without a markdown renderer.
//
// headers supplies the column titles and fixes the column count; each row is
// read positionally, with missing trailing cells treated as empty strings.
// An empty header list yields an empty string.
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}

	// cellAt returns the i-th cell of row, or "" when the row is short.
	cellAt := func(row []string, i int) string {
		if i < len(row) {
			return row[i]
		}
		return ""
	}

	// Column width = widest of the header and every cell in that column.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = len(h)
	}
	for _, row := range rows {
		for i := range widths {
			if n := len(cellAt(row, i)); n > widths[i] {
				widths[i] = n
			}
		}
	}

	var out strings.Builder

	// writeRow emits one padded "| a | b |" line using the cell accessor.
	writeRow := func(cells func(int) string) {
		out.WriteByte('|')
		for i := 0; i < ncols; i++ {
			c := cells(i)
			out.WriteByte(' ')
			out.WriteString(c)
			out.WriteString(strings.Repeat(" ", widths[i]-len(c)))
			out.WriteString(" |")
		}
		out.WriteByte('\n')
	}

	// Header row.
	writeRow(func(i int) string { return headers[i] })

	// Separator row.
	out.WriteByte('|')
	for i := 0; i < ncols; i++ {
		out.WriteString(strings.Repeat("-", widths[i]+2))
		out.WriteByte('|')
	}
	out.WriteByte('\n')

	// Data rows.
	for _, row := range rows {
		row := row // capture per iteration for the closure
		writeRow(func(i int) string { return cellAt(row, i) })
	}

	return out.String()
}
|
||||||
582
audit/internal/platform/benchmark_test.go
Normal file
582
audit/internal/platform/benchmark_test.go
Normal file
@@ -0,0 +1,582 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestResolveBenchmarkProfile verifies that each named profile resolves to the
// expected phase durations, and that an empty profile name falls back to the
// standard profile.
func TestResolveBenchmarkProfile(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name    string
		profile string
		want    benchmarkProfileSpec
	}{
		{
			name:    "default",
			profile: "",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
		},
		{
			name:    "stability",
			profile: "stability",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
		},
		{
			name:    "overnight",
			profile: "overnight",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
		},
	}

	for _, tc := range cases {
		tc := tc // capture for the subtest closure (pre-Go 1.22 loop semantics)
		t.Run(tc.name, func(t *testing.T) {
			got := resolveBenchmarkProfile(tc.profile)
			if got != tc.want {
				t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
			}
		})
	}
}
|
||||||
|
|
||||||
|
// TestBuildBenchmarkSteadyPlanStandard checks the standard-profile steady plan:
// four 60s precision phases followed by a 300s mixed phase (480s total budget),
// with the mixed phase planned last.
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
	t.Parallel()

	labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
		benchmarkPrecisionPhases,
		func(label string) string { return label }, // identity stage-name mapper
	)
	if len(labels) != 5 || len(phases) != 5 {
		t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
	}
	if basePhaseSec != 60 {
		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
	}
	if mixedPhaseSec != 300 {
		t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
	}
	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
	}
	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}
|
||||||
|
|
||||||
|
// TestBuildBenchmarkSteadyPlanStability checks the stability-profile steady
// plan: four 300s precision phases followed by a 3600s mixed phase.
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
	t.Parallel()

	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
		benchmarkPrecisionPhases,
		func(label string) string { return label }, // identity stage-name mapper
	)
	if basePhaseSec != 300 {
		t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
	}
	if mixedPhaseSec != 3600 {
		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
	}
	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}
|
||||||
|
|
||||||
|
// TestBuildBenchmarkSteadyPlanOvernight checks the overnight-profile steady
// plan: four 3600s precision phases followed by a 14400s mixed phase.
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
	t.Parallel()

	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
		benchmarkPrecisionPhases,
		func(label string) string { return label }, // identity stage-name mapper
	)
	if basePhaseSec != 3600 {
		t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
	}
	if mixedPhaseSec != 14400 {
		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
	}
	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}
|
||||||
|
|
||||||
|
// TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations verifies that metric
// rows are bucketed by the cumulative planned-phase boundaries (10s, 20s, 70s
// here), keyed by each phase's PlanLabel. Rows past the last boundary land in
// the final phase's bucket.
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
	t.Parallel()

	phases := []benchmarkPlannedPhase{
		{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
		{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
		{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
	}
	rows := []GPUMetricRow{
		{ElapsedSec: 5},  // fp8 window
		{ElapsedSec: 15}, // fp16 window
		{ElapsedSec: 25}, // mixed window
		{ElapsedSec: 65}, // mixed window
	}
	got := splitBenchmarkRowsByPlannedPhase(rows, phases)
	if len(got["fp8"]) != 1 {
		t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
	}
	if len(got["fp16"]) != 1 {
		t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
	}
	if len(got["mixed"]) != 2 {
		t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
	}
}
|
||||||
|
|
||||||
|
// TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell pins the supported
// precision list for compute capabilities 9.0 and 10.0.
// NOTE(review): despite the test name, fp4 is expected absent for 10.0 as
// well — confirm whether that exclusion is intentional for that capability.
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
	t.Parallel()

	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
		t.Fatalf("supported=%v", got)
	}
	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
		t.Fatalf("supported=%v", got)
	}
}
|
||||||
|
|
||||||
|
// TestBenchmarkPlannedPhaseStatus verifies the mapping from raw phase-log
// content to a status string: plain success, a phase error, and a phase error
// that is classified as unsupported when cublasLt profiles are unavailable.
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name       string
		raw        string
		wantStatus string
	}{
		{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
		{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
		{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
	}
	for _, tc := range cases {
		tc := tc // capture for the subtest closure (pre-Go 1.22 loop semantics)
		t.Run(tc.name, func(t *testing.T) {
			got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
			if got != tc.wantStatus {
				t.Fatalf("status=%q want %q", got, tc.wantStatus)
			}
		})
	}
}
|
||||||
|
|
||||||
|
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
before := BenchmarkThrottleCounters{}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||||
|
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||||
|
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
|
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||||
|
t.Fatal("unexpected reset call")
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
var logs []string
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||||
|
logs = append(logs, line)
|
||||||
|
})
|
||||||
|
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||||
|
t.Fatalf("logs=%q want substring %q", got, want)
|
||||||
|
}
|
||||||
|
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||||
|
t.Fatalf("failed=%v want [0 2]", failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
var calls []int
|
||||||
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
|
calls = append(calls, index)
|
||||||
|
return "ok\n", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if len(failed) != 0 {
|
||||||
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
|
}
|
||||||
|
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||||
|
t.Fatalf("calls=%v want %s", calls, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
|
if index == 5 {
|
||||||
|
return "busy\n", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
return "ok\n", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||||
|
t.Fatalf("failed=%v want %s", failed, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
|
||||||
|
Profile: "stability",
|
||||||
|
RunNCCL: false,
|
||||||
|
})
|
||||||
|
if opts.Profile != NvidiaBenchmarkProfileStability {
|
||||||
|
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
|
||||||
|
}
|
||||||
|
if opts.RunNCCL {
|
||||||
|
t.Fatalf("RunNCCL should stay false when explicitly disabled")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
info benchmarkGPUInfo
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "prefers default tdp over current derated limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 600,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 600,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "caps default tdp to reported max limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 700,
|
||||||
|
MaxPowerLimitW: 650,
|
||||||
|
},
|
||||||
|
want: 650,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to current limit when default missing",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 525,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 525,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to max limit when only that is known",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
MaxPowerLimitW: 575,
|
||||||
|
},
|
||||||
|
want: 575,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||||
|
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := strings.Join([]string{
|
||||||
|
"loader=bee-gpu-burn",
|
||||||
|
"[gpu 0] device=NVIDIA H100",
|
||||||
|
"[gpu 0] compute_capability=9.0",
|
||||||
|
"[gpu 0] backend=cublasLt",
|
||||||
|
"[gpu 0] duration_s=10",
|
||||||
|
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||||
|
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] int8_tensor_iterations=80",
|
||||||
|
"[gpu 0] fp16_tensor_iterations=200",
|
||||||
|
"[gpu 0] fp8_e4m3_iterations=50",
|
||||||
|
"[gpu 0] status=OK",
|
||||||
|
}, "\n")
|
||||||
|
|
||||||
|
got := parseBenchmarkBurnLog(raw)
|
||||||
|
if got.Backend != "cublasLt" {
|
||||||
|
t.Fatalf("backend=%q want cublasLt", got.Backend)
|
||||||
|
}
|
||||||
|
if got.ComputeCapability != "9.0" {
|
||||||
|
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||||
|
}
|
||||||
|
if len(got.Profiles) != 3 {
|
||||||
|
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||||
|
}
|
||||||
|
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||||
|
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||||
|
}
|
||||||
|
if got.Profiles[0].Category != "fp16_bf16" {
|
||||||
|
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||||
|
}
|
||||||
|
if got.Profiles[1].Category != "fp8" {
|
||||||
|
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||||
|
}
|
||||||
|
if got.Profiles[2].Category != "int8" {
|
||||||
|
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||||
|
}
|
||||||
|
if got.Profiles[2].Weight != 0.25 {
|
||||||
|
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
result := NvidiaBenchmarkResult{
|
||||||
|
BenchmarkVersion: benchmarkVersion,
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "PARTIAL",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "partial",
|
||||||
|
},
|
||||||
|
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
|
||||||
|
GPUs: []BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100",
|
||||||
|
Status: "OK",
|
||||||
|
Steady: BenchmarkTelemetrySummary{
|
||||||
|
AvgPowerW: 680,
|
||||||
|
AvgTempC: 79,
|
||||||
|
AvgGraphicsClockMHz: 1725,
|
||||||
|
P95PowerW: 700,
|
||||||
|
P95TempC: 82,
|
||||||
|
P95GraphicsClockMHz: 1800,
|
||||||
|
},
|
||||||
|
Scores: BenchmarkScorecard{
|
||||||
|
ComputeScore: 1200,
|
||||||
|
PowerSustainScore: 96,
|
||||||
|
ThermalSustainScore: 88,
|
||||||
|
StabilityScore: 92,
|
||||||
|
CompositeScore: 1176,
|
||||||
|
},
|
||||||
|
PrecisionResults: []BenchmarkPrecisionResult{
|
||||||
|
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
|
||||||
|
},
|
||||||
|
Throttle: BenchmarkThrottleCounters{
|
||||||
|
SWPowerCapUS: 1000000,
|
||||||
|
},
|
||||||
|
DegradationReasons: []string{"power_capped"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Cooling: &BenchmarkCoolingSummary{
|
||||||
|
Available: true,
|
||||||
|
AvgFanRPM: 9200,
|
||||||
|
FanDutyCycleAvailable: true,
|
||||||
|
AvgFanDutyCyclePct: 47.5,
|
||||||
|
P95FanDutyCyclePct: 62.0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
report := renderBenchmarkReport(result)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"Executive Summary",
|
||||||
|
"GPU 0 spent measurable time under SW power cap.",
|
||||||
|
"1176.00",
|
||||||
|
"fp16_tensor",
|
||||||
|
"700.00",
|
||||||
|
"Cooling",
|
||||||
|
"Average fan duty cycle",
|
||||||
|
"47.5%",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
report := renderBenchmarkReport(NvidiaBenchmarkResult{
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "OK",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "full",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, needle := range []string{
|
||||||
|
"gpu-metrics.csv",
|
||||||
|
"gpu-metrics.html",
|
||||||
|
"gpu-burn.log",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
|
||||||
|
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
|
||||||
|
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
|
||||||
|
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
|
||||||
|
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
|
||||||
|
},
|
||||||
|
PrecisionResults: []BenchmarkPrecisionResult{
|
||||||
|
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
|
||||||
|
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||||
|
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
if score.SyntheticScore != 100 {
|
||||||
|
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
|
||||||
|
}
|
||||||
|
if score.MixedScore != 50 {
|
||||||
|
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
nvsmiQ := []byte(`
|
||||||
|
GPU 00000000:4E:00.0
|
||||||
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Min Power Limit : 200.00 W
|
||||||
|
Max Power Limit : 600.00 W
|
||||||
|
Default Power Limit : 575.00 W
|
||||||
|
Current Power Limit : 560.00 W
|
||||||
|
Clocks
|
||||||
|
Graphics : 2422 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 2430 MHz
|
||||||
|
SM : 2430 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
Video : 2107 MHz
|
||||||
|
|
||||||
|
GPU 00000000:4F:00.0
|
||||||
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 2430 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
`)
|
||||||
|
|
||||||
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
|
0: {Index: 0, BusID: "00000000:4E:00.0"},
|
||||||
|
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||||
|
}
|
||||||
|
|
||||||
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
|
||||||
|
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||||
|
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||||
|
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||||
|
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].PowerLimitW != 560 {
|
||||||
|
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
nvsmiQ := []byte(`
|
||||||
|
GPU 00000000:4E:00.0
|
||||||
|
Min Power Limit : 100.00 W
|
||||||
|
Max Power Limit : 900.00 W
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 9999 MHz
|
||||||
|
Memory : 9999 MHz
|
||||||
|
`)
|
||||||
|
// Already populated — must not be overwritten.
|
||||||
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
|
0: {
|
||||||
|
Index: 0,
|
||||||
|
BusID: "00000000:4E:00.0",
|
||||||
|
MaxGraphicsClockMHz: 2430,
|
||||||
|
MaxMemoryClockMHz: 12481,
|
||||||
|
MinPowerLimitW: 200,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
|
}
|
||||||
536
audit/internal/platform/benchmark_types.go
Normal file
536
audit/internal/platform/benchmark_types.go
Normal file
@@ -0,0 +1,536 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// BenchmarkHostConfig holds static CPU and memory configuration captured at
|
||||||
|
// benchmark start. Useful for correlating results across runs on different hardware.
|
||||||
|
type BenchmarkHostConfig struct {
|
||||||
|
CPUModel string `json:"cpu_model,omitempty"`
|
||||||
|
CPUSockets int `json:"cpu_sockets,omitempty"`
|
||||||
|
CPUCores int `json:"cpu_cores,omitempty"`
|
||||||
|
CPUThreads int `json:"cpu_threads,omitempty"`
|
||||||
|
MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
|
||||||
|
// steady-state phase. High or unstable CPU load during a GPU benchmark may
|
||||||
|
// indicate a competing workload or a CPU-bound driver bottleneck.
|
||||||
|
type BenchmarkCPULoad struct {
|
||||||
|
AvgPct float64 `json:"avg_pct"`
|
||||||
|
MaxPct float64 `json:"max_pct"`
|
||||||
|
P95Pct float64 `json:"p95_pct"`
|
||||||
|
Samples int `json:"samples"`
|
||||||
|
// Status is "ok", "high", or "unstable".
|
||||||
|
Status string `json:"status"`
|
||||||
|
Note string `json:"note,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkCoolingSummary captures fan telemetry averaged across the full
|
||||||
|
// benchmark run.
|
||||||
|
type BenchmarkCoolingSummary struct {
|
||||||
|
Available bool `json:"available"`
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
|
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
NvidiaBenchmarkProfileStandard = "standard"
|
||||||
|
NvidiaBenchmarkProfileStability = "stability"
|
||||||
|
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
|
||||||
|
BenchmarkPowerEngineTargetedPower = "targeted_power"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
||||||
|
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
||||||
|
// re-measure from actual task logs and update the constants here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
|
||||||
|
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||||
|
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||||
|
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||||
|
// - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
|
||||||
|
const (
|
||||||
|
// Performance Benchmark (bee-gpu-burn).
|
||||||
|
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||||
|
// Sequential per-GPU mode scales approximately linearly.
|
||||||
|
BenchmarkEstimatedPerfStandardSec = 960 // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
|
||||||
|
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
||||||
|
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
||||||
|
|
||||||
|
// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
|
||||||
|
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||||
|
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||||
|
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||||
|
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||||
|
)
|
||||||
|
|
||||||
|
type NvidiaBenchmarkOptions struct {
|
||||||
|
Profile string
|
||||||
|
SizeMB int
|
||||||
|
GPUIndices []int
|
||||||
|
ExcludeGPUIndices []int
|
||||||
|
RunNCCL bool
|
||||||
|
ServerPowerSource string
|
||||||
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
|
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||||
|
RampTotal int // total number of ramp-up steps in this run
|
||||||
|
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
BenchmarkPowerSourceDCMI = "dcmi"
|
||||||
|
BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
|
||||||
|
)
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneConfig struct {
|
||||||
|
Version int `json:"version"`
|
||||||
|
UpdatedAt time.Time `json:"updated_at"`
|
||||||
|
SelectedSource string `json:"selected_source"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
|
Profile string `json:"profile,omitempty"`
|
||||||
|
IdleDurationSec int `json:"idle_duration_sec,omitempty"`
|
||||||
|
LoadDurationSec int `json:"load_duration_sec,omitempty"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||||
|
Confidence float64 `json:"confidence,omitempty"`
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type SystemPowerSourceDecision struct {
|
||||||
|
Configured bool `json:"configured"`
|
||||||
|
SelectedSource string `json:"selected_source,omitempty"`
|
||||||
|
EffectiveSource string `json:"effective_source,omitempty"`
|
||||||
|
Mode string `json:"mode,omitempty"` // autotuned, fallback, degraded
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
ConfiguredAt time.Time `json:"configured_at,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneResult struct {
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
|
Profile string `json:"profile,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
IdleDurationSec int `json:"idle_duration_sec"`
|
||||||
|
LoadDurationSec int `json:"load_duration_sec"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec"`
|
||||||
|
SelectedSource string `json:"selected_source,omitempty"`
|
||||||
|
IdleValidationError string `json:"idle_validation_error,omitempty"`
|
||||||
|
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
|
||||||
|
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
|
||||||
|
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
|
||||||
|
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneValidation struct {
|
||||||
|
Valid bool `json:"valid"`
|
||||||
|
GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
|
||||||
|
GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
|
||||||
|
CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
|
||||||
|
CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
|
||||||
|
GPUSamples int `json:"gpu_samples,omitempty"`
|
||||||
|
CPUSamples int `json:"cpu_samples,omitempty"`
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneCandidate struct {
|
||||||
|
Source string `json:"source"`
|
||||||
|
IdleAvgW float64 `json:"idle_avg_w,omitempty"`
|
||||||
|
LoadAvgW float64 `json:"load_avg_w,omitempty"`
|
||||||
|
DeltaW float64 `json:"delta_w,omitempty"`
|
||||||
|
Samples int `json:"samples,omitempty"`
|
||||||
|
RelativeError float64 `json:"relative_error,omitempty"`
|
||||||
|
Confidence float64 `json:"confidence,omitempty"`
|
||||||
|
Selected bool `json:"selected,omitempty"`
|
||||||
|
Available bool `json:"available"`
|
||||||
|
SelectionNotes string `json:"selection_notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaBenchmarkResult struct {
|
||||||
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
|
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||||
|
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||||
|
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||||
|
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||||
|
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||||
|
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||||
|
OverallStatus string `json:"overall_status"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
|
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||||
|
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||||
|
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||||
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkNormalization struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
GPUs []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkNormalizationGPU struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
PersistenceMode string `json:"persistence_mode,omitempty"`
|
||||||
|
GPUClockLockMHz float64 `json:"gpu_clock_lock_mhz,omitempty"`
|
||||||
|
GPUClockLockStatus string `json:"gpu_clock_lock_status,omitempty"`
|
||||||
|
MemoryClockLockMHz float64 `json:"memory_clock_lock_mhz,omitempty"`
|
||||||
|
MemoryClockLockStatus string `json:"memory_clock_lock_status,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkGPUResult struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
UUID string `json:"uuid,omitempty"`
|
||||||
|
Name string `json:"name,omitempty"`
|
||||||
|
BusID string `json:"bus_id,omitempty"`
|
||||||
|
VBIOS string `json:"vbios,omitempty"`
|
||||||
|
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||||
|
Backend string `json:"backend,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||||
|
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||||
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||||
|
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||||
|
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||||
|
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||||
|
// Fallback: 80°C.
|
||||||
|
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||||
|
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||||
|
// dcgmi targeted_power calibration run before the main benchmark.
|
||||||
|
// Used as the reference denominator for PowerSustainScore instead of
|
||||||
|
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||||
|
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||||
|
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
|
||||||
|
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
|
||||||
|
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||||
|
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||||
|
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||||
|
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||||
|
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||||
|
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||||
|
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||||
|
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||||
|
PrecisionFailures []string `json:"precision_failures,omitempty"`
|
||||||
|
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||||
|
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||||
|
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||||
|
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||||
|
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||||
|
Scores BenchmarkScorecard `json:"scores"`
|
||||||
|
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||||
|
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||||
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkTelemetrySummary struct {
|
||||||
|
DurationSec float64 `json:"duration_sec"`
|
||||||
|
Samples int `json:"samples"`
|
||||||
|
AvgTempC float64 `json:"avg_temp_c"`
|
||||||
|
P95TempC float64 `json:"p95_temp_c"`
|
||||||
|
AvgPowerW float64 `json:"avg_power_w"`
|
||||||
|
P95PowerW float64 `json:"p95_power_w"`
|
||||||
|
AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
|
||||||
|
P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
|
||||||
|
AvgMemoryClockMHz float64 `json:"avg_memory_clock_mhz"`
|
||||||
|
P95MemoryClockMHz float64 `json:"p95_memory_clock_mhz"`
|
||||||
|
AvgUsagePct float64 `json:"avg_usage_pct"`
|
||||||
|
AvgMemUsagePct float64 `json:"avg_mem_usage_pct"`
|
||||||
|
ClockCVPct float64 `json:"clock_cv_pct"`
|
||||||
|
PowerCVPct float64 `json:"power_cv_pct"`
|
||||||
|
TempCVPct float64 `json:"temp_cv_pct"`
|
||||||
|
ClockDriftPct float64 `json:"clock_drift_pct"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkThrottleCounters struct {
|
||||||
|
SWPowerCapUS uint64 `json:"sw_power_cap_us"`
|
||||||
|
SWThermalSlowdownUS uint64 `json:"sw_thermal_slowdown_us"`
|
||||||
|
SyncBoostUS uint64 `json:"sync_boost_us"`
|
||||||
|
HWThermalSlowdownUS uint64 `json:"hw_thermal_slowdown_us"`
|
||||||
|
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkECCCounters holds ECC error counts sampled at a point in time.
|
||||||
|
// Corrected = single-bit errors fixed by ECC (DRAM degradation).
|
||||||
|
// Uncorrected = double-bit errors that could not be corrected (serious fault).
|
||||||
|
// Both are volatile (since last driver reset), not persistent.
|
||||||
|
type BenchmarkECCCounters struct {
|
||||||
|
Corrected uint64 `json:"corrected"`
|
||||||
|
Uncorrected uint64 `json:"uncorrected"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e BenchmarkECCCounters) Total() uint64 { return e.Corrected + e.Uncorrected }
|
||||||
|
func (e BenchmarkECCCounters) IsZero() bool { return e.Corrected == 0 && e.Uncorrected == 0 }
|
||||||
|
|
||||||
|
type BenchmarkPrecisionResult struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Category string `json:"category"`
|
||||||
|
Supported bool `json:"supported"`
|
||||||
|
Lanes int `json:"lanes,omitempty"`
|
||||||
|
M uint64 `json:"m,omitempty"`
|
||||||
|
N uint64 `json:"n,omitempty"`
|
||||||
|
K uint64 `json:"k,omitempty"`
|
||||||
|
Iterations uint64 `json:"iterations,omitempty"`
|
||||||
|
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||||
|
// Weight is the fp32-equivalence factor for this precision category.
|
||||||
|
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||||
|
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||||
|
Weight float64 `json:"weight,omitempty"`
|
||||||
|
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||||
|
Notes string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkScorecard struct {
|
||||||
|
ComputeScore float64 `json:"compute_score"`
|
||||||
|
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
|
||||||
|
// steady phases (each precision ran alone, full GPU dedicated).
|
||||||
|
SyntheticScore float64 `json:"synthetic_score,omitempty"`
|
||||||
|
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
|
||||||
|
// (all precisions competing simultaneously — closer to real workloads).
|
||||||
|
MixedScore float64 `json:"mixed_score,omitempty"`
|
||||||
|
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
|
||||||
|
// sustains throughput under concurrent mixed-precision load.
|
||||||
|
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||||
|
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||||
|
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||||
|
// StabilityScore: fraction of steady-state time the GPU spent throttling
|
||||||
|
// (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
|
||||||
|
StabilityScore float64 `json:"stability_score"`
|
||||||
|
|
||||||
|
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||||
|
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||||
|
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||||
|
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||||
|
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||||
|
|
||||||
|
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||||
|
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||||
|
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||||
|
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||||
|
|
||||||
|
InterconnectScore float64 `json:"interconnect_score"`
|
||||||
|
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||||
|
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||||
|
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||||
|
// that throttles an otherwise fast GPU.
|
||||||
|
ServerQualityScore float64 `json:"server_quality_score"`
|
||||||
|
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||||
|
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||||
|
CompositeScore float64 `json:"composite_score"`
|
||||||
|
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||||
|
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
|
||||||
|
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
|
||||||
|
// so benchmark and audit data can be correlated by slot.
|
||||||
|
type BenchmarkPSUSlotPower struct {
|
||||||
|
InputW *float64 `json:"input_w,omitempty"` // AC wall input (PSUx_POWER_IN)
|
||||||
|
OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkServerPower captures server-side power from multiple independent
|
||||||
|
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||||
|
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||||
|
// covers only a subset of installed PSUs (partial coverage).
|
||||||
|
//
|
||||||
|
// Source legend:
|
||||||
|
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||||
|
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||||
|
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||||
|
type BenchmarkServerPower struct {
|
||||||
|
Available bool `json:"available"`
|
||||||
|
Source string `json:"source,omitempty"`
|
||||||
|
Mode string `json:"mode,omitempty"`
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||||
|
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||||
|
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||||
|
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||||
|
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||||
|
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||||
|
|
||||||
|
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||||
|
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||||
|
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||||
|
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||||
|
|
||||||
|
// PSU DC output sum — power delivered to server internals after conversion.
|
||||||
|
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||||
|
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||||
|
|
||||||
|
// Per-slot PSU readings at idle and at peak load.
|
||||||
|
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||||
|
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||||
|
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||||
|
|
||||||
|
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||||
|
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||||
|
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||||
|
|
||||||
|
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||||
|
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||||
|
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||||
|
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||||
|
// during a dedicated single-precision steady window. Because only one kernel
|
||||||
|
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||||
|
type BenchmarkPrecisionSteadyPhase struct {
|
||||||
|
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||||
|
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||||
|
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||||
|
// ECC errors accumulated during this precision phase only.
|
||||||
|
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||||
|
// Any uncorrected = serious fault triggered by this precision workload.
|
||||||
|
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||||
|
Notes string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkInterconnectResult struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
Attempted bool `json:"attempted"`
|
||||||
|
Supported bool `json:"supported"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices,omitempty"`
|
||||||
|
AvgAlgBWGBps float64 `json:"avg_algbw_gbps,omitempty"`
|
||||||
|
MaxAlgBWGBps float64 `json:"max_algbw_gbps,omitempty"`
|
||||||
|
AvgBusBWGBps float64 `json:"avg_busbw_gbps,omitempty"`
|
||||||
|
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaPowerBenchResult struct {
|
||||||
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
|
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||||
|
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||||
|
OverallStatus string `json:"overall_status"`
|
||||||
|
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||||
|
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||||
|
// this server under full GPU load. Use for rack power planning.
|
||||||
|
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||||
|
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||||
|
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||||
|
// actual wall-power draw as seen by the server's power supply.
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaPowerBenchGPU struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
Name string `json:"name,omitempty"`
|
||||||
|
BusID string `json:"bus_id,omitempty"`
|
||||||
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||||
|
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||||
|
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||||
|
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||||
|
// stably with all other GPUs running simultaneously at their own limits.
|
||||||
|
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||||
|
// additional derating.
|
||||||
|
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||||
|
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||||
|
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||||
|
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||||
|
Derated bool `json:"derated,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// Telemetry holds the aggregated stats from the final converged calibration
|
||||||
|
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||||
|
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||||
|
// Fan state sampled at the end of single-card calibration.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaPowerBenchStep struct {
|
||||||
|
StepIndex int `json:"step_index"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||||
|
NewGPUIndex int `json:"new_gpu_index"`
|
||||||
|
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||||
|
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||||
|
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||||
|
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||||
|
Derated bool `json:"derated,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// PSU slot readings sampled at end of this ramp step.
|
||||||
|
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||||
|
// Fan state at end of this ramp step.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||||
|
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
|
// scalability ramp-up phase of the performance benchmark.
|
||||||
|
type NvidiaPerformanceRampStep struct {
|
||||||
|
StepIndex int `json:"step_index"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||||
|
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||||
|
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||||
|
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||||
|
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||||
|
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||||
|
ScalabilityPct float64 `json:"scalability_pct"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
@@ -13,19 +13,27 @@ import (
|
|||||||
|
|
||||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||||
type GPUMetricRow struct {
|
type GPUMetricRow struct {
|
||||||
ElapsedSec float64 `json:"elapsed_sec"`
|
Stage string `json:"stage,omitempty"`
|
||||||
GPUIndex int `json:"index"`
|
StageStartSec float64 `json:"stage_start_sec,omitempty"`
|
||||||
TempC float64 `json:"temp_c"`
|
StageEndSec float64 `json:"stage_end_sec,omitempty"`
|
||||||
UsagePct float64 `json:"usage_pct"`
|
ElapsedSec float64 `json:"elapsed_sec"`
|
||||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
GPUIndex int `json:"index"`
|
||||||
PowerW float64 `json:"power_w"`
|
TempC float64 `json:"temp_c"`
|
||||||
ClockMHz float64 `json:"clock_mhz"`
|
UsagePct float64 `json:"usage_pct"`
|
||||||
|
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||||
|
PowerW float64 `json:"power_w"`
|
||||||
|
ClockMHz float64 `json:"clock_mhz"`
|
||||||
|
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||||
|
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||||
|
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||||
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
|
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||||
args := []string{
|
args := []string{
|
||||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
|
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
}
|
}
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
@@ -46,7 +54,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
parts := strings.Split(line, ", ")
|
parts := strings.Split(line, ", ")
|
||||||
if len(parts) < 6 {
|
if len(parts) < 7 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
@@ -57,6 +65,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
MemUsagePct: parseGPUFloat(parts[3]),
|
MemUsagePct: parseGPUFloat(parts[3]),
|
||||||
PowerW: parseGPUFloat(parts[4]),
|
PowerW: parseGPUFloat(parts[4]),
|
||||||
ClockMHz: parseGPUFloat(parts[5]),
|
ClockMHz: parseGPUFloat(parts[5]),
|
||||||
|
MemClockMHz: parseGPUFloat(parts[6]),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return rows, nil
|
return rows, nil
|
||||||
@@ -139,14 +148,28 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
|||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||||
for _, r := range rows {
|
for _, r := range rows {
|
||||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
dutyAvail := 0
|
||||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
if r.FanDutyCycleAvailable {
|
||||||
|
dutyAvail = 1
|
||||||
|
}
|
||||||
|
dutyEstimated := 0
|
||||||
|
if r.FanDutyCycleEstimated {
|
||||||
|
dutyEstimated = 1
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||||
|
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||||
}
|
}
|
||||||
return os.WriteFile(path, b.Bytes(), 0644)
|
return os.WriteFile(path, b.Bytes(), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type gpuMetricStageSpan struct {
|
||||||
|
Name string
|
||||||
|
Start float64
|
||||||
|
End float64
|
||||||
|
}
|
||||||
|
|
||||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||||
// Group by GPU index preserving order.
|
// Group by GPU index preserving order.
|
||||||
@@ -161,9 +184,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
|||||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
stageSpans := buildGPUMetricStageSpans(rows)
|
||||||
|
stageColorByName := make(map[string]string, len(stageSpans))
|
||||||
|
for i, span := range stageSpans {
|
||||||
|
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
|
||||||
|
}
|
||||||
|
|
||||||
|
var legend strings.Builder
|
||||||
|
if len(stageSpans) > 0 {
|
||||||
|
legend.WriteString(`<div class="stage-legend">`)
|
||||||
|
for _, span := range stageSpans {
|
||||||
|
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
|
||||||
|
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
|
||||||
|
}
|
||||||
|
legend.WriteString(`</div>`)
|
||||||
|
}
|
||||||
|
|
||||||
var svgs strings.Builder
|
var svgs strings.Builder
|
||||||
for _, gpuIdx := range order {
|
for _, gpuIdx := range order {
|
||||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
|
||||||
svgs.WriteString("\n")
|
svgs.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -173,21 +212,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
|||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
<title>GPU Stress Test Metrics</title>
|
<title>GPU Stress Test Metrics</title>
|
||||||
<style>
|
<style>
|
||||||
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
|
||||||
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
*{box-sizing:border-box}
|
||||||
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
|
||||||
|
.page{padding:24px}
|
||||||
|
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
|
||||||
|
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
|
||||||
|
.card-body{padding:16px}
|
||||||
|
h1{font-size:22px;margin:0 0 6px}
|
||||||
|
p{color:var(--muted);font-size:13px;margin:0 0 16px}
|
||||||
|
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
|
||||||
|
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
|
||||||
|
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
|
||||||
|
.chart-block{margin-top:16px}
|
||||||
</style>
|
</style>
|
||||||
</head><body>
|
</head><body>
|
||||||
|
<div class="page">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">GPU Stress Test Metrics</div>
|
||||||
|
<div class="card-body">
|
||||||
<h1>GPU Stress Test Metrics</h1>
|
<h1>GPU Stress Test Metrics</h1>
|
||||||
<p>Generated %s</p>
|
<p>Generated %s</p>
|
||||||
%s
|
%s
|
||||||
</body></html>`, ts, svgs.String())
|
<div class="chart-block">%s</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body></html>`, ts, legend.String(), svgs.String())
|
||||||
|
|
||||||
return os.WriteFile(path, []byte(html), 0644)
|
return os.WriteFile(path, []byte(html), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
|
||||||
// Layout
|
// Layout
|
||||||
const W, H = 960, 520
|
const W, H = 960, 520
|
||||||
const plotX1 = 120 // usage axis / chart left border
|
const plotX1 = 120 // usage axis / chart left border
|
||||||
@@ -197,7 +254,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
const PW = plotX2 - plotX1
|
const PW = plotX2 - plotX1
|
||||||
const PH = plotY2 - plotY1
|
const PH = plotY2 - plotY1
|
||||||
// Outer axes
|
// Outer axes
|
||||||
const tempAxisX = 60 // temp axis line
|
const tempAxisX = 60 // temp axis line
|
||||||
const clockAxisX = 900 // clock axis line
|
const clockAxisX = 900 // clock axis line
|
||||||
|
|
||||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||||
@@ -282,6 +339,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
}
|
}
|
||||||
b.WriteString("</g>\n")
|
b.WriteString("</g>\n")
|
||||||
|
|
||||||
|
// Stage backgrounds
|
||||||
|
for _, span := range stageSpans {
|
||||||
|
x1 := xv(span.Start)
|
||||||
|
x2 := xv(span.End)
|
||||||
|
if x2 < x1 {
|
||||||
|
x1, x2 = x2, x1
|
||||||
|
}
|
||||||
|
if x2-x1 < 1 {
|
||||||
|
x2 = x1 + 1
|
||||||
|
}
|
||||||
|
color := stageColorByName[span.Name]
|
||||||
|
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
|
||||||
|
x1, plotY1, x2-x1, PH, color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
|
||||||
|
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
|
||||||
|
}
|
||||||
|
|
||||||
// Chart border
|
// Chart border
|
||||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||||
@@ -380,224 +454,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
|
||||||
ansiRed = "\033[31m"
|
|
||||||
ansiBlue = "\033[34m"
|
|
||||||
ansiGreen = "\033[32m"
|
|
||||||
ansiYellow = "\033[33m"
|
|
||||||
ansiReset = "\033[0m"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
termChartWidth = 70
|
|
||||||
termChartHeight = 12
|
|
||||||
)
|
|
||||||
|
|
||||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
|
||||||
// Used in SAT stress-test logs.
|
|
||||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
|
||||||
seen := make(map[int]bool)
|
|
||||||
var order []int
|
|
||||||
gpuMap := make(map[int][]GPUMetricRow)
|
|
||||||
for _, r := range rows {
|
|
||||||
if !seen[r.GPUIndex] {
|
|
||||||
seen[r.GPUIndex] = true
|
|
||||||
order = append(order, r.GPUIndex)
|
|
||||||
}
|
|
||||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
|
||||||
}
|
|
||||||
|
|
||||||
type seriesDef struct {
|
|
||||||
caption string
|
|
||||||
color string
|
|
||||||
fn func(GPUMetricRow) float64
|
|
||||||
}
|
|
||||||
defs := []seriesDef{
|
|
||||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
|
||||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
|
||||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
|
||||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
|
||||||
}
|
|
||||||
|
|
||||||
var b strings.Builder
|
|
||||||
for _, gpuIdx := range order {
|
|
||||||
gr := gpuMap[gpuIdx]
|
|
||||||
if len(gr) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
|
||||||
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
|
||||||
for _, d := range defs {
|
|
||||||
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
|
||||||
termChartHeight, termChartWidth))
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings.TrimRight(b.String(), "\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
|
||||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
|
||||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
|
||||||
if len(vals) == 0 {
|
|
||||||
return caption + "\n"
|
|
||||||
}
|
|
||||||
|
|
||||||
mn, mx := gpuMinMax(vals)
|
|
||||||
if mn == mx {
|
|
||||||
mx = mn + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use the smaller of width or len(vals) to avoid stretching sparse data.
|
|
||||||
w := width
|
|
||||||
if len(vals) < w {
|
|
||||||
w = len(vals)
|
|
||||||
}
|
|
||||||
data := gpuDownsample(vals, w)
|
|
||||||
|
|
||||||
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
|
|
||||||
row := make([]int, w)
|
|
||||||
for i, v := range data {
|
|
||||||
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
|
|
||||||
if r < 0 {
|
|
||||||
r = 0
|
|
||||||
}
|
|
||||||
if r > height {
|
|
||||||
r = height
|
|
||||||
}
|
|
||||||
row[i] = r
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fill the character grid.
|
|
||||||
grid := make([][]rune, height+1)
|
|
||||||
for i := range grid {
|
|
||||||
grid[i] = make([]rune, w)
|
|
||||||
for j := range grid[i] {
|
|
||||||
grid[i][j] = ' '
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for x := 0; x < w; x++ {
|
|
||||||
r := row[x]
|
|
||||||
if x == 0 {
|
|
||||||
grid[r][0] = '─'
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
p := row[x-1]
|
|
||||||
switch {
|
|
||||||
case r == p:
|
|
||||||
grid[r][x] = '─'
|
|
||||||
case r < p: // value went up (row index decreased toward top)
|
|
||||||
grid[r][x] = '╭'
|
|
||||||
grid[p][x] = '╯'
|
|
||||||
for y := r + 1; y < p; y++ {
|
|
||||||
grid[y][x] = '│'
|
|
||||||
}
|
|
||||||
default: // r > p, value went down
|
|
||||||
grid[p][x] = '╮'
|
|
||||||
grid[r][x] = '╰'
|
|
||||||
for y := p + 1; y < r; y++ {
|
|
||||||
grid[y][x] = '│'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Y axis tick labels.
|
|
||||||
ticks := gpuNiceTicks(mn, mx, height/2)
|
|
||||||
tickAtRow := make(map[int]string)
|
|
||||||
labelWidth := 4
|
|
||||||
for _, t := range ticks {
|
|
||||||
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
|
|
||||||
if r < 0 || r > height {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
s := gpuFormatTick(t)
|
|
||||||
tickAtRow[r] = s
|
|
||||||
if len(s) > labelWidth {
|
|
||||||
labelWidth = len(s)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var b strings.Builder
|
|
||||||
for r := 0; r <= height; r++ {
|
|
||||||
label := tickAtRow[r]
|
|
||||||
fmt.Fprintf(&b, "%*s", labelWidth, label)
|
|
||||||
switch {
|
|
||||||
case label != "":
|
|
||||||
b.WriteRune('┤')
|
|
||||||
case r == height:
|
|
||||||
b.WriteRune('┼')
|
|
||||||
default:
|
|
||||||
b.WriteRune('│')
|
|
||||||
}
|
|
||||||
b.WriteString(color)
|
|
||||||
b.WriteString(string(grid[r]))
|
|
||||||
b.WriteString(ansiReset)
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bottom axis.
|
|
||||||
b.WriteString(strings.Repeat(" ", labelWidth))
|
|
||||||
b.WriteRune('└')
|
|
||||||
b.WriteString(strings.Repeat("─", w))
|
|
||||||
b.WriteRune('\n')
|
|
||||||
|
|
||||||
// Caption centered under the chart.
|
|
||||||
if caption != "" {
|
|
||||||
total := labelWidth + 1 + w
|
|
||||||
if pad := (total - len(caption)) / 2; pad > 0 {
|
|
||||||
b.WriteString(strings.Repeat(" ", pad))
|
|
||||||
}
|
|
||||||
b.WriteString(caption)
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
|
||||||
v := make([]float64, len(rows))
|
|
||||||
for i, r := range rows {
|
|
||||||
v[i] = fn(r)
|
|
||||||
}
|
|
||||||
return v
|
|
||||||
}
|
|
||||||
|
|
||||||
// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
|
|
||||||
func gpuDownsample(vals []float64, w int) []float64 {
|
|
||||||
n := len(vals)
|
|
||||||
if n == 0 {
|
|
||||||
return make([]float64, w)
|
|
||||||
}
|
|
||||||
result := make([]float64, w)
|
|
||||||
if n >= w {
|
|
||||||
counts := make([]int, w)
|
|
||||||
for i, v := range vals {
|
|
||||||
bucket := i * w / n
|
|
||||||
if bucket >= w {
|
|
||||||
bucket = w - 1
|
|
||||||
}
|
|
||||||
result[bucket] += v
|
|
||||||
counts[bucket]++
|
|
||||||
}
|
|
||||||
for i := range result {
|
|
||||||
if counts[i] > 0 {
|
|
||||||
result[i] /= float64(counts[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Nearest-neighbour upsample.
|
|
||||||
for i := range result {
|
|
||||||
src := i * (n - 1) / (w - 1)
|
|
||||||
if src >= n {
|
|
||||||
src = n - 1
|
|
||||||
}
|
|
||||||
result[i] = vals[src]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
func gpuMinMax(vals []float64) (float64, float64) {
|
func gpuMinMax(vals []float64) (float64, float64) {
|
||||||
if len(vals) == 0 {
|
if len(vals) == 0 {
|
||||||
return 0, 1
|
return 0, 1
|
||||||
@@ -642,3 +498,57 @@ func gpuFormatTick(v float64) string {
|
|||||||
}
|
}
|
||||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var gpuMetricStagePalette = []string{
|
||||||
|
"#d95c5c",
|
||||||
|
"#2185d0",
|
||||||
|
"#21ba45",
|
||||||
|
"#f2c037",
|
||||||
|
"#6435c9",
|
||||||
|
"#00b5ad",
|
||||||
|
"#a5673f",
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||||
|
var spans []gpuMetricStageSpan
|
||||||
|
for _, row := range rows {
|
||||||
|
name := strings.TrimSpace(row.Stage)
|
||||||
|
if name == "" {
|
||||||
|
name = "run"
|
||||||
|
}
|
||||||
|
start := row.StageStartSec
|
||||||
|
end := row.StageEndSec
|
||||||
|
if end <= start {
|
||||||
|
start = row.ElapsedSec
|
||||||
|
end = row.ElapsedSec
|
||||||
|
}
|
||||||
|
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||||
|
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if start < spans[len(spans)-1].Start {
|
||||||
|
spans[len(spans)-1].Start = start
|
||||||
|
}
|
||||||
|
if end > spans[len(spans)-1].End {
|
||||||
|
spans[len(spans)-1].End = end
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i := range spans {
|
||||||
|
if spans[i].End <= spans[i].Start {
|
||||||
|
spans[i].End = spans[i].Start + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return spans
|
||||||
|
}
|
||||||
|
|
||||||
|
var gpuHTMLReplacer = strings.NewReplacer(
|
||||||
|
"&", "&",
|
||||||
|
"<", "<",
|
||||||
|
">", ">",
|
||||||
|
`"`, """,
|
||||||
|
"'", "'",
|
||||||
|
)
|
||||||
|
|
||||||
|
func gpuHTMLEscape(s string) string {
|
||||||
|
return gpuHTMLReplacer.Replace(s)
|
||||||
|
}
|
||||||
|
|||||||
65
audit/internal/platform/gpu_metrics_test.go
Normal file
65
audit/internal/platform/gpu_metrics_test.go
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "gpu-metrics.csv")
|
||||||
|
rows := []GPUMetricRow{
|
||||||
|
{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
|
||||||
|
}
|
||||||
|
if err := WriteGPUMetricsCSV(path, rows); err != nil {
|
||||||
|
t.Fatalf("WriteGPUMetricsCSV: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"stage,elapsed_sec,gpu_index",
|
||||||
|
`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(text, needle) {
|
||||||
|
t.Fatalf("csv missing %q\n%s", needle, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "gpu-metrics.html")
|
||||||
|
rows := []GPUMetricRow{
|
||||||
|
{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
|
||||||
|
{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
|
||||||
|
{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
|
||||||
|
{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
|
||||||
|
}
|
||||||
|
if err := WriteGPUMetricsHTML(path, rows); err != nil {
|
||||||
|
t.Fatalf("WriteGPUMetricsHTML: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"stage-legend",
|
||||||
|
"baseline",
|
||||||
|
"steady-fp16",
|
||||||
|
"GPU Stress Test Metrics",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(text, needle) {
|
||||||
|
t.Fatalf("html missing %q\n%s", needle, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -11,12 +11,11 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const installToRAMDir = "/dev/shm/bee-live"
|
||||||
|
const copyProgressLogStep int64 = 100 * 1024 * 1024
|
||||||
|
|
||||||
func (s *System) IsLiveMediaInRAM() bool {
|
func (s *System) IsLiveMediaInRAM() bool {
|
||||||
fsType := mountFSType("/run/live/medium")
|
return s.LiveMediaRAMState().InRAM
|
||||||
if fsType == "" {
|
|
||||||
return toramActive()
|
|
||||||
}
|
|
||||||
return strings.EqualFold(fsType, "tmpfs")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) LiveBootSource() LiveBootSource {
|
func (s *System) LiveBootSource() LiveBootSource {
|
||||||
@@ -48,42 +47,164 @@ func (s *System) LiveBootSource() LiveBootSource {
|
|||||||
return status
|
return status
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
|
||||||
|
return evaluateLiveMediaRAMState(
|
||||||
|
s.LiveBootSource(),
|
||||||
|
toramActive(),
|
||||||
|
globPaths("/run/live/medium/live/*.squashfs"),
|
||||||
|
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
|
||||||
|
state := LiveMediaRAMState{
|
||||||
|
LiveBootSource: status,
|
||||||
|
ToramActive: toram,
|
||||||
|
CopyPresent: len(copiedSquashfs) > 0,
|
||||||
|
}
|
||||||
|
if status.InRAM {
|
||||||
|
state.State = "in_ram"
|
||||||
|
state.Status = "ok"
|
||||||
|
state.CopyComplete = true
|
||||||
|
state.Message = "Running from RAM — installation media can be safely disconnected."
|
||||||
|
return state
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := pathBaseSet(sourceSquashfs)
|
||||||
|
copied := pathBaseSet(copiedSquashfs)
|
||||||
|
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case state.CopyComplete:
|
||||||
|
state.State = "partial"
|
||||||
|
state.Status = "partial"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
|
||||||
|
case state.CopyPresent:
|
||||||
|
state.State = "partial"
|
||||||
|
state.Status = "partial"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
|
||||||
|
case toram:
|
||||||
|
state.State = "toram_failed"
|
||||||
|
state.Status = "failed"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
|
||||||
|
default:
|
||||||
|
state.State = "not_in_ram"
|
||||||
|
state.Status = "warning"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
|
||||||
|
}
|
||||||
|
return state
|
||||||
|
}
|
||||||
|
|
||||||
|
func globPaths(pattern string) []string {
|
||||||
|
matches, _ := filepath.Glob(pattern)
|
||||||
|
return matches
|
||||||
|
}
|
||||||
|
|
||||||
|
func pathBaseSet(paths []string) map[string]struct{} {
|
||||||
|
out := make(map[string]struct{}, len(paths))
|
||||||
|
for _, path := range paths {
|
||||||
|
base := strings.TrimSpace(filepath.Base(path))
|
||||||
|
if base != "" {
|
||||||
|
out[base] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func setContainsAll(have, want map[string]struct{}) bool {
|
||||||
|
if len(want) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for name := range want {
|
||||||
|
if _, ok := have[name]; !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
|
||||||
log := func(msg string) {
|
log := func(msg string) {
|
||||||
if logFunc != nil {
|
if logFunc != nil {
|
||||||
logFunc(msg)
|
logFunc(msg)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.IsLiveMediaInRAM() {
|
state := s.LiveMediaRAMState()
|
||||||
|
if state.InRAM {
|
||||||
log("Already running from RAM — installation media can be safely disconnected.")
|
log("Already running from RAM — installation media can be safely disconnected.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||||
if err != nil || len(squashfsFiles) == 0 {
|
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
|
||||||
}
|
|
||||||
|
|
||||||
free := freeMemBytes()
|
dstDir := installToRAMDir
|
||||||
var needed int64
|
|
||||||
for _, sf := range squashfsFiles {
|
// If the source medium is unavailable, check whether a previous run already
|
||||||
fi, err2 := os.Stat(sf)
|
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||||
if err2 != nil {
|
// directly to the loop-rebind / bind-mount steps.
|
||||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
if !sourceAvailable {
|
||||||
|
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(copiedFiles) > 0 {
|
||||||
|
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||||
|
// Proceed to rebind with the already-copied files.
|
||||||
|
for _, dst := range copiedFiles {
|
||||||
|
base := filepath.Base(dst)
|
||||||
|
// Re-associate the loop device that was originally backed by the
|
||||||
|
// source file (now gone); find it by the old source path pattern.
|
||||||
|
srcGuess := "/run/live/medium/live/" + base
|
||||||
|
loopDev, lerr := findLoopForFile(srcGuess)
|
||||||
|
if lerr != nil {
|
||||||
|
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||||
|
} else {
|
||||||
|
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto bindMedium
|
||||||
}
|
}
|
||||||
needed += fi.Size()
|
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||||
}
|
|
||||||
const headroom = 256 * 1024 * 1024
|
|
||||||
if free > 0 && needed+headroom > free {
|
|
||||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
|
||||||
humanBytes(needed+headroom), humanBytes(free))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dstDir := "/dev/shm/bee-live"
|
{
|
||||||
|
free := freeMemBytes()
|
||||||
|
var needed int64
|
||||||
|
for _, sf := range squashfsFiles {
|
||||||
|
fi, err2 := os.Stat(sf)
|
||||||
|
if err2 != nil {
|
||||||
|
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||||
|
}
|
||||||
|
needed += fi.Size()
|
||||||
|
}
|
||||||
|
const headroom = 256 * 1024 * 1024
|
||||||
|
if free > 0 && needed+headroom > free {
|
||||||
|
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||||
|
humanBytes(needed+headroom), humanBytes(free))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if state.CopyPresent {
|
||||||
|
log("Removing stale partial RAM copy before retry...")
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(dstDir)
|
||||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||||
}
|
}
|
||||||
|
defer func() {
|
||||||
|
if retErr == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(dstDir)
|
||||||
|
log("Removed incomplete RAM copy.")
|
||||||
|
}()
|
||||||
|
|
||||||
for _, sf := range squashfsFiles {
|
for _, sf := range squashfsFiles {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
@@ -109,6 +230,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bindMedium:
|
||||||
log("Copying remaining medium files...")
|
log("Copying remaining medium files...")
|
||||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||||
@@ -116,14 +238,71 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
|||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
|
||||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
mediumRebound := false
|
||||||
|
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
|
||||||
|
} else {
|
||||||
|
mediumRebound = true
|
||||||
}
|
}
|
||||||
|
|
||||||
log("Done. Installation media can be safely disconnected.")
|
log("Verifying live medium now served from RAM...")
|
||||||
|
status := s.LiveBootSource()
|
||||||
|
if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if status.InRAM {
|
||||||
|
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||||
|
}
|
||||||
|
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||||
|
if status.InRAM {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// The live medium mount was not redirected to RAM. This is expected when
|
||||||
|
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
|
||||||
|
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
|
||||||
|
// because the CD-ROM mount is in use. Check whether files were at least
|
||||||
|
// copied to the tmpfs directory — that is sufficient for safe disconnection
|
||||||
|
// once the kernel has paged in all actively-used data.
|
||||||
|
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(files) > 0 {
|
||||||
|
if !mediumRebound {
|
||||||
|
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
|
||||||
|
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func describeLiveBootSource(status LiveBootSource) string {
|
||||||
|
source := strings.TrimSpace(status.Device)
|
||||||
|
if source == "" {
|
||||||
|
source = strings.TrimSpace(status.Source)
|
||||||
|
}
|
||||||
|
if source == "" {
|
||||||
|
source = "unknown source"
|
||||||
|
}
|
||||||
|
switch strings.TrimSpace(status.Kind) {
|
||||||
|
case "ram":
|
||||||
|
return "RAM"
|
||||||
|
case "usb":
|
||||||
|
return "USB (" + source + ")"
|
||||||
|
case "cdrom":
|
||||||
|
return "CD-ROM (" + source + ")"
|
||||||
|
case "disk":
|
||||||
|
return "disk (" + source + ")"
|
||||||
|
default:
|
||||||
|
return source
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
in, err := os.Open(src)
|
in, err := os.Open(src)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -141,6 +320,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
defer out.Close()
|
defer out.Close()
|
||||||
total := fi.Size()
|
total := fi.Size()
|
||||||
var copied int64
|
var copied int64
|
||||||
|
var lastLogged int64
|
||||||
buf := make([]byte, 4*1024*1024)
|
buf := make([]byte, 4*1024*1024)
|
||||||
for {
|
for {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
@@ -152,7 +332,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
return werr
|
return werr
|
||||||
}
|
}
|
||||||
copied += int64(n)
|
copied += int64(n)
|
||||||
if logFunc != nil && total > 0 {
|
if shouldLogCopyProgress(copied, total, lastLogged) {
|
||||||
|
lastLogged = copied
|
||||||
pct := int(float64(copied) / float64(total) * 100)
|
pct := int(float64(copied) / float64(total) * 100)
|
||||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||||
}
|
}
|
||||||
@@ -167,6 +348,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
return out.Sync()
|
return out.Sync()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
|
||||||
|
if total <= 0 || copied <= 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if copied >= total {
|
||||||
|
return copied > lastLogged
|
||||||
|
}
|
||||||
|
if copied < copyProgressLogStep {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return copied-lastLogged >= copyProgressLogStep
|
||||||
|
}
|
||||||
|
|
||||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
@@ -212,7 +406,31 @@ func findLoopForFile(backingFile string) (string, error) {
|
|||||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loopDeviceOffset returns the byte offset configured for the loop device,
|
||||||
|
// or -1 if it cannot be determined.
|
||||||
|
func loopDeviceOffset(loopDev string) int64 {
|
||||||
|
out, err := exec.Command("losetup", "--json", loopDev).Output()
|
||||||
|
if err != nil {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
var result struct {
|
||||||
|
Loopdevices []struct {
|
||||||
|
Offset int64 `json:"offset"`
|
||||||
|
} `json:"loopdevices"`
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(out, &result); err != nil || len(result.Loopdevices) == 0 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
return result.Loopdevices[0].Offset
|
||||||
|
}
|
||||||
|
|
||||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||||
|
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
|
||||||
|
// typically set up with a non-zero offset (squashfs lives inside the ISO),
|
||||||
|
// so the ioctl returns EINVAL. Detect this early for a clear error message.
|
||||||
|
if off := loopDeviceOffset(loopDev); off > 0 {
|
||||||
|
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
|
||||||
|
}
|
||||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bindMount binds src over dst using the syscall directly (avoids exec PATH issues).
|
||||||
|
func bindMount(src, dst string) error {
|
||||||
|
return syscall.Mount(src, dst, "", syscall.MS_BIND, "")
|
||||||
|
}
|
||||||
|
|||||||
@@ -7,3 +7,7 @@ import "errors"
|
|||||||
func loopChangeFD(loopDev, newFile string) error {
|
func loopChangeFD(loopDev, newFile string) error {
|
||||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func bindMount(src, dst string) error {
|
||||||
|
return errors.New("bind mount not available on this platform")
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package platform
|
|||||||
import "testing"
|
import "testing"
|
||||||
|
|
||||||
func TestInferLiveBootKind(t *testing.T) {
|
func TestInferLiveBootKind(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
fsType string
|
fsType string
|
||||||
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
|
|||||||
{name: "unknown", source: "overlay", want: "unknown"},
|
{name: "unknown", source: "overlay", want: "unknown"},
|
||||||
}
|
}
|
||||||
for _, tc := range tests {
|
for _, tc := range tests {
|
||||||
|
tc := tc
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||||
if got != tc.want {
|
if got != tc.want {
|
||||||
@@ -26,3 +29,98 @@ func TestInferLiveBootKind(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dstDir := t.TempDir()
|
||||||
|
|
||||||
|
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
|
||||||
|
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected verification failure when media is still on USB")
|
||||||
|
}
|
||||||
|
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
|
||||||
|
t.Fatalf("error=%q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDescribeLiveBootSource(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
|
||||||
|
t.Fatalf("got %q want RAM", got)
|
||||||
|
}
|
||||||
|
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
|
||||||
|
t.Fatalf("got %q want /run/live/medium", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
t.Run("in_ram", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
|
||||||
|
false,
|
||||||
|
nil,
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("partial_copy_after_cancel", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||||
|
false,
|
||||||
|
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
|
||||||
|
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
|
||||||
|
)
|
||||||
|
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
if state.CopyComplete {
|
||||||
|
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("toram_failed", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||||
|
true,
|
||||||
|
nil,
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestShouldLogCopyProgress(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
total := int64(250 * 1024 * 1024)
|
||||||
|
step := int64(100 * 1024 * 1024)
|
||||||
|
|
||||||
|
if shouldLogCopyProgress(step-1, total, 0) {
|
||||||
|
t.Fatal("progress logged too early")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(step, total, 0) {
|
||||||
|
t.Fatal("expected log at first 100MB boundary")
|
||||||
|
}
|
||||||
|
if shouldLogCopyProgress(step+16*1024*1024, total, step) {
|
||||||
|
t.Fatal("progress logged again before next 100MB")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(2*step, total, step) {
|
||||||
|
t.Fatal("expected log at second 100MB boundary")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(total, total, 2*step) {
|
||||||
|
t.Fatal("expected final completion log")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||||
@@ -15,6 +18,11 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
"nvbandwidth",
|
||||||
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
|
"nvvs",
|
||||||
|
"dcgmi",
|
||||||
}
|
}
|
||||||
|
|
||||||
// KilledProcess describes a process that was sent SIGKILL.
|
// KilledProcess describes a process that was sent SIGKILL.
|
||||||
@@ -26,7 +34,12 @@ type KilledProcess struct {
|
|||||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||||
// SIGKILL to each one found. It returns a list of killed processes.
|
// SIGKILL to each one found. It returns a list of killed processes.
|
||||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||||
|
// The scan runs under a 5-second deadline to avoid blocking if the process
|
||||||
|
// table is very large (e.g. after a stress test with thousands of children).
|
||||||
func KillTestWorkers() []KilledProcess {
|
func KillTestWorkers() []KilledProcess {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
entries, err := os.ReadDir("/proc")
|
entries, err := os.ReadDir("/proc")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
@@ -34,6 +47,13 @@ func KillTestWorkers() []KilledProcess {
|
|||||||
|
|
||||||
var killed []KilledProcess
|
var killed []KilledProcess
|
||||||
for _, e := range entries {
|
for _, e := range entries {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
|
||||||
|
return killed
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
if !e.IsDir() {
|
if !e.IsDir() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -52,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
|
|||||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||||
base = exe[idx+1:]
|
base = exe[idx+1:]
|
||||||
}
|
}
|
||||||
for _, pat := range workerPatterns {
|
if shouldKillWorkerProcess(exe, base) {
|
||||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return killed
|
return killed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldKillWorkerProcess(exe, base string) bool {
|
||||||
|
for _, pat := range workerPatterns {
|
||||||
|
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|||||||
39
audit/internal/platform/kill_workers_test.go
Normal file
39
audit/internal/platform/kill_workers_test.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestShouldKillWorkerProcess(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
exe string
|
||||||
|
base string
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "nvbandwidth executable",
|
||||||
|
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
|
||||||
|
base: "nvbandwidth",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "dcgmi executable",
|
||||||
|
exe: "/usr/bin/dcgmi",
|
||||||
|
base: "dcgmi",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "unrelated process",
|
||||||
|
exe: "/usr/bin/bash",
|
||||||
|
base: "bash",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
|
||||||
|
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,8 +1,10 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"sort"
|
"sort"
|
||||||
@@ -14,13 +16,24 @@ import (
|
|||||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||||
// collected for the web UI metrics page.
|
// collected for the web UI metrics page.
|
||||||
type LiveMetricSample struct {
|
type LiveMetricSample struct {
|
||||||
Timestamp time.Time `json:"ts"`
|
Timestamp time.Time `json:"ts"`
|
||||||
Fans []FanReading `json:"fans"`
|
Fans []FanReading `json:"fans"`
|
||||||
Temps []TempReading `json:"temps"`
|
Temps []TempReading `json:"temps"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
PowerSource string `json:"power_source,omitempty"`
|
||||||
MemLoadPct float64 `json:"mem_load_pct"`
|
PowerMode string `json:"power_mode,omitempty"`
|
||||||
GPUs []GPUMetricRow `json:"gpus"`
|
PowerReason string `json:"power_reason,omitempty"`
|
||||||
|
PSUs []PSUReading `json:"psus,omitempty"`
|
||||||
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// PSUReading is a per-slot power supply input power reading.
|
||||||
|
type PSUReading struct {
|
||||||
|
Slot int `json:"slot"`
|
||||||
|
Name string `json:"name"`
|
||||||
|
PowerW float64 `json:"power_w"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// TempReading is a named temperature sensor value.
|
// TempReading is a named temperature sensor value.
|
||||||
@@ -54,8 +67,17 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// System power — returns 0 if unavailable
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||||
s.PowerW = sampleSystemPower()
|
s.PSUs = samplePSUPower()
|
||||||
|
|
||||||
|
// System power: use the global autotune-selected source when configured,
|
||||||
|
// otherwise fall back to the historical heuristic and mark the mode.
|
||||||
|
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||||
|
s.PowerW = powerW
|
||||||
|
s.PowerSource = decision.EffectiveSource
|
||||||
|
s.PowerMode = decision.Mode
|
||||||
|
s.PowerReason = decision.Reason
|
||||||
|
}
|
||||||
|
|
||||||
// CPU load — from /proc/stat
|
// CPU load — from /proc/stat
|
||||||
s.CPULoadPct = sampleCPULoadPct()
|
s.CPULoadPct = sampleCPULoadPct()
|
||||||
@@ -326,3 +348,46 @@ func compactAmbientTempName(chip, name string) string {
|
|||||||
}
|
}
|
||||||
return chip + " / " + name
|
return chip + " / " + name
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||||
|
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
||||||
|
// vendors where PSU sensors may not carry entity ID "10.N".
|
||||||
|
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||||
|
func samplePSUPower() []PSUReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slots := collector.PSUSlotsFromSDR(string(out))
|
||||||
|
if len(slots) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// Collect slot keys and sort for stable output.
|
||||||
|
keys := make([]int, 0, len(slots))
|
||||||
|
for k := range slots {
|
||||||
|
n, err := strconv.Atoi(k)
|
||||||
|
if err == nil {
|
||||||
|
keys = append(keys, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Ints(keys)
|
||||||
|
psus := make([]PSUReading, 0, len(keys))
|
||||||
|
for _, k := range keys {
|
||||||
|
entry := slots[strconv.Itoa(k)]
|
||||||
|
// Prefer AC input power; fall back to DC output power.
|
||||||
|
var w float64
|
||||||
|
if entry.InputW != nil && *entry.InputW > 0 {
|
||||||
|
w = *entry.InputW
|
||||||
|
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
||||||
|
w = *entry.OutputW
|
||||||
|
}
|
||||||
|
if w <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
||||||
|
}
|
||||||
|
if len(psus) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|||||||
51
audit/internal/platform/nvidia_recover.go
Normal file
51
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||||
|
|
||||||
|
func runNvidiaRecover(args ...string) (string, error) {
|
||||||
|
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||||
|
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||||
|
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||||
|
cmdArgs := []string{
|
||||||
|
"systemd-run",
|
||||||
|
"--quiet",
|
||||||
|
"--pipe",
|
||||||
|
"--wait",
|
||||||
|
"--collect",
|
||||||
|
"--service-type=oneshot",
|
||||||
|
"--unit", unit,
|
||||||
|
}
|
||||||
|
cmdArgs = append(cmdArgs, helperArgs...)
|
||||||
|
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
|
||||||
|
func resetNvidiaGPU(index int) (string, error) {
|
||||||
|
if index < 0 {
|
||||||
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
|
}
|
||||||
|
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "GPU reset completed.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func restartNvidiaDrivers() (string, error) {
|
||||||
|
out, err := runNvidiaRecover("restart-drivers")
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "NVIDIA drivers restarted.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
job,
|
job,
|
||||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaStressArchivePrefix(loader string) string {
|
func nvidiaStressArchivePrefix(loader string) string {
|
||||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
|
|||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||||
if gpuCmd == nil {
|
if gpuCmd == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
|
|||||||
}
|
}
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||||
|
|
||||||
// Pack tar.gz
|
return runDir, nil
|
||||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
|
||||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
|
||||||
return "", fmt.Errorf("pack archive: %w", err)
|
|
||||||
}
|
|
||||||
_ = os.RemoveAll(runDir)
|
|
||||||
return archivePath, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// collectPhase samples live metrics every second until ctx is done.
|
// collectPhase samples live metrics every second until ctx is done.
|
||||||
@@ -392,6 +386,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||||
@@ -402,28 +403,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
|
|
||||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||||
switch strings.ToLower(vendor) {
|
switch strings.ToLower(vendor) {
|
||||||
case "amd":
|
case "amd":
|
||||||
return buildAMDGPUStressCmd(ctx)
|
return buildAMDGPUStressCmd(ctx, durSec)
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return buildNvidiaGPUStressCmd(ctx)
|
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
rvsArgs, err := resolveRVSCommand()
|
rvsArgs, err := resolveRVSCommand()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
rvsPath := rvsArgs[0]
|
rvsPath := rvsArgs[0]
|
||||||
cfg := `actions:
|
cfg := fmt.Sprintf(`actions:
|
||||||
- name: gst_platform
|
- name: gst_platform
|
||||||
device: all
|
device: all
|
||||||
module: gst
|
module: gst
|
||||||
parallel: true
|
parallel: true
|
||||||
duration: 86400000
|
duration: %d`, durSec*1000) + `
|
||||||
copy_matrix: false
|
copy_matrix: false
|
||||||
target_stress: 90
|
target_stress: 90
|
||||||
matrix_size_a: 8640
|
matrix_size_a: 8640
|
||||||
@@ -433,13 +434,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
return cmd
|
return cmd
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
path, err := satLookPath("bee-gpu-burn")
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
path, err = satLookPath("bee-gpu-stress")
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
@@ -447,7 +455,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||||
|
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||||
|
// where the context is cancelled early (user stop, parent timeout).
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -27,6 +28,8 @@ var runtimeTrackedServices = []string{
|
|||||||
"bee-audit",
|
"bee-audit",
|
||||||
"bee-web",
|
"bee-web",
|
||||||
"bee-sshsetup",
|
"bee-sshsetup",
|
||||||
|
"nvidia-dcgm",
|
||||||
|
"nvidia-fabricmanager",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
@@ -114,6 +117,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
}
|
}
|
||||||
|
|
||||||
s.collectGPURuntimeHealth(vendor, &health)
|
s.collectGPURuntimeHealth(vendor, &health)
|
||||||
|
s.collectToRAMHealth(&health)
|
||||||
|
s.collectUSBExportHealth(&health)
|
||||||
|
|
||||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||||
health.Status = "PARTIAL"
|
health.Status = "PARTIAL"
|
||||||
@@ -135,12 +140,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
case "nvidia":
|
case "nvidia":
|
||||||
tools = append(tools, s.CheckTools([]string{
|
tools = append(tools, s.CheckTools([]string{
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
|
"dcgmi",
|
||||||
|
"nv-hostengine",
|
||||||
"nvidia-bug-report.sh",
|
"nvidia-bug-report.sh",
|
||||||
"bee-gpu-burn",
|
"bee-gpu-burn",
|
||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"bee-nccl-gpu-stress",
|
"bee-nccl-gpu-stress",
|
||||||
"all_reduce_perf",
|
"all_reduce_perf",
|
||||||
})...)
|
})...)
|
||||||
|
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
|
||||||
case "amd":
|
case "amd":
|
||||||
tool := ToolStatus{Name: "rocm-smi"}
|
tool := ToolStatus{Name: "rocm-smi"}
|
||||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||||
@@ -155,11 +163,130 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
return tools
|
return tools
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
path, err := exec.LookPath(candidate)
|
||||||
|
if err == nil {
|
||||||
|
return ToolStatus{Name: display, Path: path, OK: true}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ToolStatus{Name: display}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
||||||
|
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
||||||
|
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
||||||
|
// "failed" = toram was requested but medium is not in RAM.
|
||||||
|
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||||
|
state := s.LiveMediaRAMState()
|
||||||
|
health.ToRAMStatus = state.Status
|
||||||
|
switch state.Status {
|
||||||
|
case "ok":
|
||||||
|
return
|
||||||
|
case "failed":
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "toram_copy_failed",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: state.Message,
|
||||||
|
})
|
||||||
|
case "partial":
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "toram_copy_partial",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: state.Message,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
||||||
|
// suitable for log export. Sets USBExportPath to the first match found.
|
||||||
|
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
||||||
|
health.USBExportPath = findUSBExportMount()
|
||||||
|
}
|
||||||
|
|
||||||
|
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
||||||
|
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
||||||
|
// has USB transport. Returns "" if none found.
|
||||||
|
func findUSBExportMount() string {
|
||||||
|
f, err := os.Open("/proc/mounts")
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
// fs types that are expected on USB export drives
|
||||||
|
exportFSTypes := map[string]bool{
|
||||||
|
"vfat": true,
|
||||||
|
"exfat": true,
|
||||||
|
"ext2": true,
|
||||||
|
"ext3": true,
|
||||||
|
"ext4": true,
|
||||||
|
"ntfs": true,
|
||||||
|
"ntfs3": true,
|
||||||
|
"fuseblk": true,
|
||||||
|
}
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
for scanner.Scan() {
|
||||||
|
// fields: device mountpoint fstype options dump pass
|
||||||
|
fields := strings.Fields(scanner.Text())
|
||||||
|
if len(fields) < 4 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||||
|
if !exportFSTypes[strings.ToLower(fsType)] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Skip read-only mounts
|
||||||
|
opts := strings.Split(options, ",")
|
||||||
|
readOnly := false
|
||||||
|
for _, o := range opts {
|
||||||
|
if strings.TrimSpace(o) == "ro" {
|
||||||
|
readOnly = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if readOnly {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Check USB transport via lsblk on the device (or its parent disk for partitions).
|
||||||
|
if !strings.HasPrefix(device, "/dev/") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
checkDev := device
|
||||||
|
// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
|
||||||
|
// Strip trailing partition digits to get the parent disk name.
|
||||||
|
if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
|
||||||
|
checkDev = trimmed
|
||||||
|
}
|
||||||
|
if blockDeviceTransport(checkDev) == "usb" {
|
||||||
|
return mountPoint
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||||
lsmodText := commandText("lsmod")
|
lsmodText := commandText("lsmod")
|
||||||
|
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||||
|
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_stuck",
|
||||||
|
Severity: "critical",
|
||||||
|
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||||
|
})
|
||||||
|
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_disabled",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||||
if !health.DriverReady {
|
if !health.DriverReady {
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
|||||||
@@ -12,19 +12,68 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"syscall"
|
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for each SAT/validate test, derived from real
|
||||||
|
// production logs in _benchmark/_v8/.
|
||||||
|
//
|
||||||
|
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
|
||||||
|
// the corresponding Run*Pack function change, re-measure the wall-clock duration
|
||||||
|
// from actual task logs and update the matching constant here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||||
|
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||||
|
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||||
|
const (
|
||||||
|
// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
|
||||||
|
SATEstimatedCPUValidateSec = 65
|
||||||
|
// CPU stress: stress-ng 1800 s (stress mode default).
|
||||||
|
SATEstimatedCPUStressSec = 1800
|
||||||
|
|
||||||
|
// RAM: memtester 256 MB / 1 pass.
|
||||||
|
SATEstimatedMemoryValidateSec = 70
|
||||||
|
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||||
|
SATEstimatedMemoryStressSec = 140
|
||||||
|
|
||||||
|
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaGPUValidateSec = 85
|
||||||
|
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaGPUStressSec = 450
|
||||||
|
|
||||||
|
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaTargetedStressSec = 350
|
||||||
|
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaTargetedPowerSec = 350
|
||||||
|
|
||||||
|
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||||
|
SATEstimatedNvidiaPulseTestSec = 5000
|
||||||
|
|
||||||
|
// NCCL all_reduce_perf, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaInterconnectSec = 300
|
||||||
|
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
|
||||||
|
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
|
||||||
|
SATEstimatedNvidiaBandwidthSec = 2700
|
||||||
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
satExecCommand = exec.Command
|
satExecCommand = exec.Command
|
||||||
satLookPath = exec.LookPath
|
satLookPath = exec.LookPath
|
||||||
satGlob = filepath.Glob
|
satGlob = filepath.Glob
|
||||||
satStat = os.Stat
|
satStat = os.Stat
|
||||||
|
satFreeMemBytes = freeMemBytes
|
||||||
|
|
||||||
rocmSMIExecutableGlobs = []string{
|
rocmSMIExecutableGlobs = []string{
|
||||||
"/opt/rocm/bin/rocm-smi",
|
"/opt/rocm/bin/rocm-smi",
|
||||||
@@ -38,6 +87,12 @@ var (
|
|||||||
"/opt/rocm/bin/rvs",
|
"/opt/rocm/bin/rvs",
|
||||||
"/opt/rocm-*/bin/rvs",
|
"/opt/rocm-*/bin/rvs",
|
||||||
}
|
}
|
||||||
|
dcgmProfTesterCandidates = []string{
|
||||||
|
"dcgmproftester",
|
||||||
|
"dcgmproftester13",
|
||||||
|
"dcgmproftester12",
|
||||||
|
"dcgmproftester11",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||||||
@@ -76,15 +131,46 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
|
|||||||
|
|
||||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||||
type NvidiaGPU struct {
|
type NvidiaGPU struct {
|
||||||
Index int
|
Index int `json:"index"`
|
||||||
Name string
|
Name string `json:"name"`
|
||||||
MemoryMB int
|
MemoryMB int `json:"memory_mb"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaGPUStatus struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
Name string `json:"name"`
|
||||||
|
BDF string `json:"bdf,omitempty"`
|
||||||
|
Serial string `json:"serial,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
RawLine string `json:"raw_line,omitempty"`
|
||||||
|
NeedsReset bool `json:"needs_reset"`
|
||||||
|
ParseFailure bool `json:"parse_failure,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type nvidiaGPUHealth struct {
|
||||||
|
Index int
|
||||||
|
Name string
|
||||||
|
NeedsReset bool
|
||||||
|
RawLine string
|
||||||
|
ParseFailure bool
|
||||||
|
}
|
||||||
|
|
||||||
|
type nvidiaGPUStatusFile struct {
|
||||||
|
Index int
|
||||||
|
Name string
|
||||||
|
RunStatus string
|
||||||
|
Reason string
|
||||||
|
Health string
|
||||||
|
HealthRaw string
|
||||||
|
Observed bool
|
||||||
|
Selected bool
|
||||||
|
FailingJob string
|
||||||
}
|
}
|
||||||
|
|
||||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||||
type AMDGPUInfo struct {
|
type AMDGPUInfo struct {
|
||||||
Index int
|
Index int `json:"index"`
|
||||||
Name string
|
Name string `json:"name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||||||
@@ -256,25 +342,206 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|||||||
MemoryMB: memMB,
|
MemoryMB: memMB,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
|
return gpus[i].Index < gpus[j].Index
|
||||||
|
})
|
||||||
return gpus, nil
|
return gpus, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
|
||||||
|
out, err := satExecCommand(
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
var gpus []NvidiaGPUStatus
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
if len(parts) < 4 {
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
upper := strings.ToUpper(line)
|
||||||
|
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
|
||||||
|
status := "OK"
|
||||||
|
if needsReset {
|
||||||
|
status = "RESET_REQUIRED"
|
||||||
|
}
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{
|
||||||
|
Index: idx,
|
||||||
|
Name: strings.TrimSpace(parts[1]),
|
||||||
|
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
|
||||||
|
Serial: strings.TrimSpace(parts[3]),
|
||||||
|
Status: status,
|
||||||
|
RawLine: line,
|
||||||
|
NeedsReset: needsReset,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
|
||||||
|
return gpus, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeNvidiaBusID(v string) string {
|
||||||
|
v = strings.TrimSpace(strings.ToLower(v))
|
||||||
|
parts := strings.Split(v, ":")
|
||||||
|
if len(parts) == 3 && len(parts[0]) > 4 {
|
||||||
|
parts[0] = parts[0][len(parts[0])-4:]
|
||||||
|
return strings.Join(parts, ":")
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||||
|
return resetNvidiaGPU(index)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
// detect GPU count
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
if err != nil {
|
||||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
return "", err
|
||||||
|
}
|
||||||
|
gpuCount := len(selected)
|
||||||
if gpuCount < 1 {
|
if gpuCount < 1 {
|
||||||
gpuCount = 1
|
gpuCount = 1
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
}},
|
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
var (
|
||||||
|
profCmd []string
|
||||||
|
profEnv []string
|
||||||
|
)
|
||||||
|
if len(selected) > 1 {
|
||||||
|
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||||
|
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||||
|
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||||
|
// of CUDA_VISIBLE_DEVICES.
|
||||||
|
stagger := staggerSec
|
||||||
|
if stagger < 0 {
|
||||||
|
stagger = 0
|
||||||
|
}
|
||||||
|
profCmd = []string{
|
||||||
|
"bee-dcgmproftester-staggered",
|
||||||
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
|
"--stagger-seconds", strconv.Itoa(stagger),
|
||||||
|
"--devices", joinIndexList(selected),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||||
|
satJob{
|
||||||
|
name: "03-dcgmproftester.log",
|
||||||
|
cmd: profCmd,
|
||||||
|
env: profEnv,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-targeted-power.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-pulse-test.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-nvbandwidth.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
@@ -293,6 +560,30 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
|
|||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-targeted-stress.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
return dedupeSortedIndices(gpuIndices), nil
|
return dedupeSortedIndices(gpuIndices), nil
|
||||||
@@ -307,12 +598,45 @@ func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
|||||||
return all, nil
|
return all, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func memoryStressSizeArg() string {
|
||||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
return fmt.Sprintf("%dM", mb)
|
||||||
|
}
|
||||||
|
availBytes := satFreeMemBytes()
|
||||||
|
if availBytes <= 0 {
|
||||||
|
return "80%"
|
||||||
|
}
|
||||||
|
availMB := availBytes / (1024 * 1024)
|
||||||
|
targetMB := (availMB * 2) / 3
|
||||||
|
if targetMB >= 256 {
|
||||||
|
targetMB = (targetMB / 256) * 256
|
||||||
|
}
|
||||||
|
if targetMB <= 0 {
|
||||||
|
return "80%"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dM", targetMB)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
|
if sizeMB <= 0 {
|
||||||
|
sizeMB = 256
|
||||||
|
}
|
||||||
|
if passes <= 0 {
|
||||||
|
passes = 1
|
||||||
|
}
|
||||||
|
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||||
|
// intentionally conservative enough for healthy systems while avoiding the
|
||||||
|
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||||
|
timeoutSec := sizeMB*passes*20/100 + 60
|
||||||
|
if timeoutSec < 180 {
|
||||||
|
timeoutSec = 180
|
||||||
|
}
|
||||||
|
if timeoutSec > 900 {
|
||||||
|
timeoutSec = 900
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
@@ -322,11 +646,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
|
|||||||
if seconds <= 0 {
|
if seconds <= 0 {
|
||||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||||
}
|
}
|
||||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
// Base the default on current MemAvailable and keep headroom for the OS and
|
||||||
sizeArg := "80%"
|
// concurrent stressors so mixed burn runs do not trip the OOM killer.
|
||||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
sizeArg := memoryStressSizeArg()
|
||||||
sizeArg = fmt.Sprintf("%dM", mb)
|
|
||||||
}
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||||
@@ -368,7 +690,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -400,7 +722,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||||
commands := storageSATCommands(devPath)
|
commands := storageSATCommands(devPath, extended)
|
||||||
for cmdIndex, job := range commands {
|
for cmdIndex, job := range commands {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
break
|
break
|
||||||
@@ -422,11 +744,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
|||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
|
return runDir, nil
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type satJob struct {
|
type satJob struct {
|
||||||
@@ -443,14 +761,24 @@ type satStats struct {
|
|||||||
Unsupported int
|
Unsupported int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||||
|
out := make([]satJob, 0, len(jobs)+1)
|
||||||
|
out = append(out, satJob{
|
||||||
|
name: "00-nvidia-smi-persistence-mode.log",
|
||||||
|
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||||
|
})
|
||||||
|
out = append(out, jobs...)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func nvidiaSATJobs() []satJob {
|
func nvidiaSATJobs() []satJob {
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||||
}
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
@@ -465,11 +793,39 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
|||||||
}
|
}
|
||||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||||
}
|
}
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||||
|
args := []string{"dcgmi", "diag", "-r", name}
|
||||||
|
if durationSec > 0 {
|
||||||
|
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
args = append(args, "-i", joinIndexList(gpuIndices))
|
||||||
|
}
|
||||||
|
return args
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeNvidiaBurnDuration(durationSec int) int {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 300
|
||||||
|
}
|
||||||
|
return durationSec
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||||||
|
if len(gpuIndices) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -489,11 +845,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
|
|
||||||
var summary strings.Builder
|
var summary strings.Builder
|
||||||
stats := satStats{}
|
stats := satStats{}
|
||||||
|
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
|
||||||
|
perGPU := map[int]*nvidiaGPUStatusFile{}
|
||||||
|
selectedGPUIndices := map[int]struct{}{}
|
||||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
for _, job := range jobs {
|
for _, job := range jobs {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
for _, idx := range job.gpuIndices {
|
||||||
|
selectedGPUIndices[idx] = struct{}{}
|
||||||
|
status := perGPU[idx]
|
||||||
|
if status == nil {
|
||||||
|
status = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = status
|
||||||
|
}
|
||||||
|
status.Selected = true
|
||||||
|
}
|
||||||
cmd := make([]string, 0, len(job.cmd))
|
cmd := make([]string, 0, len(job.cmd))
|
||||||
for _, arg := range job.cmd {
|
for _, arg := range job.cmd {
|
||||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||||
@@ -502,17 +870,52 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
var out []byte
|
var out []byte
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
if job.collectGPU {
|
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||||
} else {
|
if logFunc != nil {
|
||||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
out = []byte(msg + "\n")
|
||||||
|
err = healthErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
if job.collectGPU {
|
||||||
|
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||||
|
} else {
|
||||||
|
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
|
||||||
|
out = append(out, '\n')
|
||||||
|
}
|
||||||
|
out = append(out, []byte(msg+"\n")...)
|
||||||
|
if err == nil {
|
||||||
|
err = healthErr
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return "", ctx.Err()
|
||||||
|
}
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
status, rc := classifySATResult(job.name, out, err)
|
||||||
stats.Add(status)
|
stats.Add(status)
|
||||||
|
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
for _, idx := range job.gpuIndices {
|
||||||
|
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
|
||||||
|
}
|
||||||
|
}
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||||
@@ -521,12 +924,204 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
if nvidiaPack {
|
||||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
return "", err
|
||||||
return "", err
|
}
|
||||||
}
|
}
|
||||||
return archive, nil
|
|
||||||
|
return runDir, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = entry
|
||||||
|
}
|
||||||
|
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||||
|
entry.RunStatus = status
|
||||||
|
entry.FailingJob = jobName
|
||||||
|
entry.Reason = firstLine(detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
|
||||||
|
health, err := readNvidiaGPUHealth()
|
||||||
|
if err == nil {
|
||||||
|
for _, gpu := range health {
|
||||||
|
entry := perGPU[gpu.Index]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
|
||||||
|
perGPU[gpu.Index] = entry
|
||||||
|
}
|
||||||
|
entry.Name = gpu.Name
|
||||||
|
entry.Observed = true
|
||||||
|
entry.HealthRaw = gpu.RawLine
|
||||||
|
if gpu.NeedsReset {
|
||||||
|
entry.Health = "RESET_REQUIRED"
|
||||||
|
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||||
|
entry.RunStatus = "FAILED"
|
||||||
|
if strings.TrimSpace(entry.Reason) == "" {
|
||||||
|
entry.Reason = "GPU requires reset"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
entry.Health = "OK"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for idx := range selected {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = entry
|
||||||
|
}
|
||||||
|
entry.Selected = true
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for idx := range perGPU {
|
||||||
|
indices = append(indices, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(indices)
|
||||||
|
for _, idx := range indices {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry.RunStatus == "" {
|
||||||
|
entry.RunStatus = overall
|
||||||
|
}
|
||||||
|
if entry.Health == "" {
|
||||||
|
entry.Health = "UNKNOWN"
|
||||||
|
}
|
||||||
|
if entry.Name == "" {
|
||||||
|
entry.Name = "Unknown GPU"
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||||
|
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
|
||||||
|
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
|
||||||
|
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
|
||||||
|
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
|
||||||
|
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
|
||||||
|
if strings.TrimSpace(entry.FailingJob) != "" {
|
||||||
|
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(entry.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(entry.HealthRaw) != "" {
|
||||||
|
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaSATStatusSeverity(status string) int {
|
||||||
|
switch strings.ToUpper(strings.TrimSpace(status)) {
|
||||||
|
case "FAILED":
|
||||||
|
return 3
|
||||||
|
case "PARTIAL", "UNSUPPORTED":
|
||||||
|
return 2
|
||||||
|
case "OK":
|
||||||
|
return 1
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstLine(s string) string {
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
if s == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
if idx := strings.IndexByte(s, '\n'); idx >= 0 {
|
||||||
|
return strings.TrimSpace(s[:idx])
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaJobNeedsHealthCheck(job satJob) bool {
|
||||||
|
if job.collectGPU {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
name := strings.ToLower(strings.TrimSpace(job.name))
|
||||||
|
return strings.Contains(name, "dcgmi") ||
|
||||||
|
strings.Contains(name, "gpu-burn") ||
|
||||||
|
strings.Contains(name, "gpu-stress") ||
|
||||||
|
strings.Contains(name, "dcgmproftester")
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkNvidiaJobHealth(selected []int) (string, error) {
|
||||||
|
health, err := readNvidiaGPUHealth()
|
||||||
|
if err != nil {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
var bad []nvidiaGPUHealth
|
||||||
|
selectedSet := make(map[int]struct{}, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
selectedSet[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
for _, gpu := range health {
|
||||||
|
if len(selectedSet) > 0 {
|
||||||
|
if _, ok := selectedSet[gpu.Index]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if gpu.NeedsReset {
|
||||||
|
bad = append(bad, gpu)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(bad) == 0 {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
lines := make([]string, 0, len(bad)+1)
|
||||||
|
lines = append(lines, "NVIDIA GPU health check failed:")
|
||||||
|
for _, gpu := range bad {
|
||||||
|
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
|
||||||
|
}
|
||||||
|
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
|
||||||
|
}
|
||||||
|
|
||||||
|
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
|
||||||
|
out, err := satExecCommand(
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
return parseNvidiaGPUHealth(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
|
||||||
|
var gpus []nvidiaGPUHealth
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
if len(parts) < 2 {
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
upper := strings.ToUpper(line)
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{
|
||||||
|
Index: idx,
|
||||||
|
Name: strings.TrimSpace(parts[1]),
|
||||||
|
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
|
||||||
|
RawLine: line,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return gpus
|
||||||
}
|
}
|
||||||
|
|
||||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||||
@@ -583,17 +1178,25 @@ func listStorageDevices() ([]string, error) {
|
|||||||
return parseStorageDevices(string(out)), nil
|
return parseStorageDevices(string(out)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func storageSATCommands(devPath string) []satJob {
|
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||||
|
selfTestLevel := "1"
|
||||||
|
if extended {
|
||||||
|
selfTestLevel = "2"
|
||||||
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
smartTestType := "short"
|
||||||
|
if extended {
|
||||||
|
smartTestType = "long"
|
||||||
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -642,6 +1245,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
}
|
}
|
||||||
if strings.Contains(text, "unsupported") ||
|
if strings.Contains(text, "unsupported") ||
|
||||||
strings.Contains(text, "not supported") ||
|
strings.Contains(text, "not supported") ||
|
||||||
|
strings.Contains(text, "not found in path") ||
|
||||||
strings.Contains(text, "invalid opcode") ||
|
strings.Contains(text, "invalid opcode") ||
|
||||||
strings.Contains(text, "unknown command") ||
|
strings.Contains(text, "unknown command") ||
|
||||||
strings.Contains(text, "not implemented") ||
|
strings.Contains(text, "not implemented") ||
|
||||||
@@ -651,6 +1255,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
// nvidia-smi on a machine with no NVIDIA GPU
|
// nvidia-smi on a machine with no NVIDIA GPU
|
||||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||||
strings.Contains(text, "no nvidia gpu") ||
|
strings.Contains(text, "no nvidia gpu") ||
|
||||||
|
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
|
||||||
|
// while waiting, so the CLI stops polling without proving device failure.
|
||||||
|
(strings.Contains(name, "self-test") &&
|
||||||
|
strings.Contains(text, "no progress for") &&
|
||||||
|
strings.Contains(text, "stop waiting")) ||
|
||||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||||
return "UNSUPPORTED", rc
|
return "UNSUPPORTED", rc
|
||||||
}
|
}
|
||||||
@@ -748,6 +1357,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
|||||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
|
||||||
|
for _, candidate := range dcgmProfTesterCandidates {
|
||||||
|
if path, err := satLookPath(candidate); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, errors.New("dcgmproftester not found in PATH")
|
||||||
|
}
|
||||||
|
|
||||||
func ensureAMDRuntimeReady() error {
|
func ensureAMDRuntimeReady() error {
|
||||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||||
return nil
|
return nil
|
||||||
@@ -846,8 +1464,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
|||||||
if len(metricRows) > 0 {
|
if len(metricRows) > 0 {
|
||||||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
||||||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
||||||
chart := RenderGPUTerminalChart(metricRows)
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return out, err
|
return out, err
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -20,7 +21,7 @@ type FanStressOptions struct {
|
|||||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||||
PauseSec int // pause between the two load phases (default 60)
|
PauseSec int // pause between the two load phases (default 60)
|
||||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,27 +43,56 @@ type GPUStressMetric struct {
|
|||||||
|
|
||||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||||
type FanStressRow struct {
|
type FanStressRow struct {
|
||||||
TimestampUTC string
|
TimestampUTC string
|
||||||
ElapsedSec float64
|
ElapsedSec float64
|
||||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||||
GPUs []GPUStressMetric
|
GPUs []GPUStressMetric
|
||||||
Fans []FanReading
|
Fans []FanReading
|
||||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64
|
||||||
|
SysPowerSource string
|
||||||
|
SysPowerMode string
|
||||||
}
|
}
|
||||||
|
|
||||||
type cachedPowerReading struct {
|
type cachedPowerReading struct {
|
||||||
Value float64
|
Value float64
|
||||||
|
Source string
|
||||||
|
Mode string
|
||||||
|
Reason string
|
||||||
UpdatedAt time.Time
|
UpdatedAt time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type fanObservationState struct {
|
||||||
|
MaxRPM map[string]float64 `json:"max_rpm"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type fanPeakCandidate struct {
|
||||||
|
FirstSeen time.Time
|
||||||
|
RPM float64
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
systemPowerCacheMu sync.Mutex
|
systemPowerCacheMu sync.Mutex
|
||||||
systemPowerCache cachedPowerReading
|
systemPowerCache cachedPowerReading
|
||||||
|
fanObservationMu sync.Mutex
|
||||||
|
fanObservation fanObservationState
|
||||||
|
fanObservationInit bool
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
)
|
)
|
||||||
|
|
||||||
const systemPowerHoldTTL = 15 * time.Second
|
const systemPowerHoldTTL = 15 * time.Second
|
||||||
|
|
||||||
|
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||||
|
|
||||||
|
const fanObservationMinPeakHold = time.Second
|
||||||
|
|
||||||
|
// normalizeObservedFanMaxRPM rounds an observed peak fan speed up to the next
// multiple of 1000 RPM; non-positive readings normalize to zero.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	if rpm > 0 {
		return 1000.0 * math.Ceil(rpm/1000.0)
	}
	return 0
}
|
||||||
|
|
||||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
@@ -223,11 +253,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
return runDir, nil
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||||
@@ -243,9 +269,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
|||||||
if opts.Phase2DurSec <= 0 {
|
if opts.Phase2DurSec <= 0 {
|
||||||
opts.Phase2DurSec = 300
|
opts.Phase2DurSec = 300
|
||||||
}
|
}
|
||||||
if opts.SizeMB <= 0 {
|
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||||
opts.SizeMB = 64
|
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||||
@@ -258,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
|||||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||||
row.Fans, _ = sampleFanSpeeds()
|
row.Fans, _ = sampleFanSpeeds()
|
||||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||||
row.SysPowerW = sampleSystemPower()
|
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||||
return row
|
return row
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -315,11 +340,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||||
|
updateFanObservation(fans, time.Now())
|
||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||||
if len(fans) > 0 {
|
if len(fans) > 0 {
|
||||||
|
updateFanObservation(fans, time.Now())
|
||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -328,6 +355,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
return nil, sensorsErr
|
return nil, sensorsErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func loadFanObservationLocked() {
|
||||||
|
if fanObservationInit {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fanObservationInit = true
|
||||||
|
fanObservation.MaxRPM = make(map[string]float64)
|
||||||
|
raw, err := os.ReadFile(fanObservationStatePath)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var persisted fanObservationState
|
||||||
|
if json.Unmarshal(raw, &persisted) != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for name, rpm := range persisted.MaxRPM {
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
if name == "" || rpm <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fanObservation.MaxRPM[name] = rpm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func saveFanObservationLocked() {
|
||||||
|
if len(fanObservation.MaxRPM) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
dir := filepath.Dir(fanObservationStatePath)
|
||||||
|
if dir == "" || dir == "." {
|
||||||
|
dir = "/var/log/bee-sat"
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateFanObservation folds a batch of fan readings into the persisted
// per-fan maximum-RPM observation state. A reading above a fan's recorded
// maximum does not immediately raise it: the reading becomes a "peak
// candidate" and must persist above the old maximum for at least
// fanObservationMinPeakHold before the maximum is updated (filters
// single-sample spikes). Updated maxima are rounded via
// normalizeObservedFanMaxRPM and written back to disk when anything changed.
func updateFanObservation(fans []FanReading, now time.Time) {
	if len(fans) == 0 {
		return
	}
	fanObservationMu.Lock()
	defer fanObservationMu.Unlock()
	loadFanObservationLocked()
	changed := false
	for _, fan := range fans {
		name := strings.TrimSpace(fan.Name)
		if name == "" || fan.RPM <= 0 {
			// Skip unnamed or stopped/invalid fans.
			continue
		}
		currentMax := fanObservation.MaxRPM[name]
		if fan.RPM <= currentMax {
			// Reading dropped back under the recorded max: abandon any
			// pending peak candidate for this fan.
			delete(fanPeakCandidates, name)
			continue
		}
		if cand, ok := fanPeakCandidates[name]; ok {
			if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
				// Candidate held above the old max long enough; commit the
				// highest RPM seen (candidate vs. current reading).
				newMax := math.Max(cand.RPM, fan.RPM)
				if newMax > currentMax {
					fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
					changed = true
				}
				delete(fanPeakCandidates, name)
				continue
			}
			// Still within the hold window: remember the highest candidate
			// RPM but keep the original FirstSeen timestamp.
			if fan.RPM > cand.RPM {
				fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
			}
			continue
		}
		// First reading above the recorded max: start a new candidate.
		fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
	}
	if changed {
		saveFanObservationLocked()
	}
}
|
||||||
|
|
||||||
|
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
fanObservationMu.Lock()
|
||||||
|
defer fanObservationMu.Unlock()
|
||||||
|
loadFanObservationLocked()
|
||||||
|
var samples []float64
|
||||||
|
for _, fan := range fans {
|
||||||
|
name := strings.TrimSpace(fan.Name)
|
||||||
|
if name == "" || fan.RPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
maxRPM := fanObservation.MaxRPM[name]
|
||||||
|
if maxRPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
pct := fan.RPM / maxRPM * 100.0
|
||||||
|
if pct > 100 {
|
||||||
|
pct = 100
|
||||||
|
}
|
||||||
|
if pct < 0 {
|
||||||
|
pct = 0
|
||||||
|
}
|
||||||
|
samples = append(samples, pct)
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
// Handles two formats:
|
// Handles two formats:
|
||||||
//
|
//
|
||||||
@@ -431,6 +571,116 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
|||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||||
|
// Returns the average duty cycle across all exposed PWM controls.
|
||||||
|
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
fans, fanErr := sampleFanSpeeds()
|
||||||
|
if fanErr != nil {
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
return sampleFanDutyCyclePctFromFans(fans)
|
||||||
|
}
|
||||||
|
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||||
|
return pct, ok, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||||
|
return pct, true, true
|
||||||
|
}
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
var samples []float64
|
||||||
|
for _, features := range doc {
|
||||||
|
for name, feature := range features {
|
||||||
|
if strings.EqualFold(name, "Adapter") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
featureMap, ok := feature.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if duty, ok := firstFanDutyValue(name, featureMap); ok {
|
||||||
|
samples = append(samples, duty)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
|
||||||
|
featureName = strings.ToLower(strings.TrimSpace(featureName))
|
||||||
|
if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
if strings.Contains(featureName, "pwm") {
|
||||||
|
for _, key := range []string{"input", "value", "current"} {
|
||||||
|
if value, ok := feature[key]; ok {
|
||||||
|
if duty, parsed := parseFanDutyValue(value); parsed {
|
||||||
|
return duty, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
keys := make([]string, 0, len(feature))
|
||||||
|
for key := range feature {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
for _, key := range keys {
|
||||||
|
lower := strings.ToLower(key)
|
||||||
|
if !strings.Contains(lower, "pwm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if duty, parsed := parseFanDutyValue(feature[key]); parsed {
|
||||||
|
return duty, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseFanDutyValue(value any) (float64, bool) {
|
||||||
|
switch v := value.(type) {
|
||||||
|
case float64:
|
||||||
|
return normalizePWMAsDutyPct(v)
|
||||||
|
case string:
|
||||||
|
if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil {
|
||||||
|
return normalizePWMAsDutyPct(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizePWMAsDutyPct interprets a raw PWM value as a duty-cycle
// percentage. Values already in [0, 100] pass through unchanged; values in
// (100, 255] are scaled from the 8-bit PWM range; anything else (negative,
// > 255, or NaN) is rejected.
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
	switch {
	case raw < 0:
		return 0, false
	case raw <= 100:
		return raw, true
	case raw <= 255:
		return raw / 255.0 * 100.0, true
	default:
		return 0, false
	}
}
|
||||||
|
|
||||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||||
keys := make([]string, 0, len(feature))
|
keys := make([]string, 0, len(feature))
|
||||||
for key := range feature {
|
for key := range feature {
|
||||||
@@ -518,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
return max
|
return max
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||||
func sampleSystemPower() float64 {
|
// falling back to the historical heuristic before autotune or when degraded.
|
||||||
|
func sampleSystemPowerResolved() (float64, string, string) {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
current := 0.0
|
current, decision, err := SampleSystemPowerResolved("")
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
|
||||||
if err == nil {
|
|
||||||
current = parseDCMIPowerReading(string(out))
|
|
||||||
}
|
|
||||||
systemPowerCacheMu.Lock()
|
systemPowerCacheMu.Lock()
|
||||||
defer systemPowerCacheMu.Unlock()
|
defer systemPowerCacheMu.Unlock()
|
||||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
if err != nil {
|
||||||
|
current = 0
|
||||||
|
}
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||||
systemPowerCache = updated
|
systemPowerCache = updated
|
||||||
return value
|
return value, updated.Source, updated.Mode
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -553,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||||
if current > 0 {
|
if current > 0 {
|
||||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||||
return current, cache
|
return current, cache
|
||||||
}
|
}
|
||||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@@ -29,6 +30,74 @@ func TestFirstFanInputValue(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||||
|
raw := []byte(`{
|
||||||
|
"chip0": {
|
||||||
|
"fan1": {"input": 9000},
|
||||||
|
"pwm1": {"input": 128},
|
||||||
|
"pwm1_enable": {"input": 1}
|
||||||
|
},
|
||||||
|
"chip1": {
|
||||||
|
"pwm2": {"input": 64}
|
||||||
|
}
|
||||||
|
}`)
|
||||||
|
|
||||||
|
got, ok := parseFanDutyCyclePctSensorsJSON(raw)
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected duty cycle telemetry to be parsed")
|
||||||
|
}
|
||||||
|
if got < 57 || got > 58 {
|
||||||
|
t.Fatalf("got=%v want ~57.1", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldPath := fanObservationStatePath
|
||||||
|
oldState := fanObservation
|
||||||
|
oldInit := fanObservationInit
|
||||||
|
oldCandidates := fanPeakCandidates
|
||||||
|
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||||
|
fanObservation = fanObservationState{}
|
||||||
|
fanObservationInit = false
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
|
t.Cleanup(func() {
|
||||||
|
fanObservationStatePath = oldPath
|
||||||
|
fanObservation = oldState
|
||||||
|
fanObservationInit = oldInit
|
||||||
|
fanPeakCandidates = oldCandidates
|
||||||
|
})
|
||||||
|
|
||||||
|
start := time.Unix(100, 0)
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||||
|
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||||
|
t.Fatalf("single-sample spike should not establish observed max")
|
||||||
|
}
|
||||||
|
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||||
|
|
||||||
|
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||||
|
}
|
||||||
|
if got < 43 || got > 44 {
|
||||||
|
t.Fatalf("got=%v want ~43.3", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
fanObservation = fanObservationState{}
|
||||||
|
fanObservationInit = false
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
|
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||||
|
}
|
||||||
|
if got < 43 || got > 44 {
|
||||||
|
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseDCMIPowerReading(t *testing.T) {
|
func TestParseDCMIPowerReading(t *testing.T) {
|
||||||
raw := `
|
raw := `
|
||||||
Instantaneous power reading: 512 Watts
|
Instantaneous power reading: 512 Watts
|
||||||
@@ -43,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||||
if got != 480 {
|
if got != 480 {
|
||||||
t.Fatalf("got=%v want cached 480", got)
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
}
|
}
|
||||||
@@ -51,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
t.Fatalf("updated=%+v", updated)
|
t.Fatalf("updated=%+v", updated)
|
||||||
}
|
}
|
||||||
|
|
||||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||||
if got != 530 {
|
if got != 530 {
|
||||||
t.Fatalf("got=%v want 530", got)
|
t.Fatalf("got=%v want 530", got)
|
||||||
}
|
}
|
||||||
@@ -60,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||||
if got != 0 {
|
if got != 0 {
|
||||||
t.Fatalf("expired cache returned %v want 0", got)
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,23 +1,25 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestStorageSATCommands(t *testing.T) {
|
func TestStorageSATCommands(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvme := storageSATCommands("/dev/nvme0n1")
|
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||||
}
|
}
|
||||||
|
|
||||||
sata := storageSATCommands("/dev/sda")
|
sata := storageSATCommands("/dev/sda", false)
|
||||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||||
}
|
}
|
||||||
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
|
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
|
|
||||||
if len(jobs) != 5 {
|
if len(jobs) != 6 {
|
||||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||||
}
|
}
|
||||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||||
|
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||||
}
|
}
|
||||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
|||||||
|
|
||||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
got := jobs[4].cmd
|
got := jobs[5].cmd
|
||||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||||
if len(got) != len(want) {
|
if len(got) != len(want) {
|
||||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||||
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||||
|
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||||
|
if len(jobs) != 5 {
|
||||||
|
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||||
|
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -195,6 +216,137 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[0].NeedsReset {
|
||||||
|
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||||
|
}
|
||||||
|
if !got[1].NeedsReset {
|
||||||
|
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
msg, err := checkNvidiaJobHealth([]int{1})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected health check error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||||
|
t.Fatalf("unexpected message: %q", msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||||
|
0: {Index: 0, RunStatus: "OK"},
|
||||||
|
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||||
|
}
|
||||||
|
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||||
|
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
if !strings.Contains(text, "run_status=FAILED") {
|
||||||
|
t.Fatalf("missing run status:\n%s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||||
|
t.Fatalf("missing health status:\n%s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||||
|
t.Fatalf("missing failing job:\n%s", text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
switch file {
|
||||||
|
case "dcgmproftester13":
|
||||||
|
return "/usr/bin/dcgmproftester13", nil
|
||||||
|
default:
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 4 {
|
||||||
|
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != "/usr/bin/dcgmproftester13" {
|
||||||
|
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
|
||||||
|
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
|
||||||
|
if len(cmd) != len(want) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if cmd[i] != want[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||||
|
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||||
|
if len(cmd) != len(want) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if cmd[i] != want[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
|
if len(env) != 2 {
|
||||||
|
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||||
|
}
|
||||||
|
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||||
|
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||||
|
}
|
||||||
|
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||||
|
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -229,6 +381,37 @@ func TestEnvIntFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "65536M" {
|
||||||
|
t.Fatalf("sizeArg=%q want 65536M", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "4096M" {
|
||||||
|
t.Fatalf("sizeArg=%q want 4096M", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 0 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "80%" {
|
||||||
|
t.Fatalf("sizeArg=%q want 80%%", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestClassifySATResult(t *testing.T) {
|
func TestClassifySATResult(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
@@ -239,6 +422,7 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
|
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
}
|
}
|
||||||
@@ -253,6 +437,38 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
t.Cleanup(cancel)
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||||
|
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||||
|
}, nil)
|
||||||
|
<-done
|
||||||
|
|
||||||
|
if !errors.Is(err, context.Canceled) {
|
||||||
|
t.Fatalf("err=%v want context.Canceled", err)
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
t.Fatalf("archive=%q want empty", archive)
|
||||||
|
}
|
||||||
|
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||||
|
if globErr != nil {
|
||||||
|
t.Fatalf("Glob error: %v", globErr)
|
||||||
|
}
|
||||||
|
if len(matches) != 0 {
|
||||||
|
t.Fatalf("archives=%v want none", matches)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -10,17 +10,30 @@ import (
|
|||||||
func (s *System) ListBeeServices() ([]string, error) {
|
func (s *System) ListBeeServices() ([]string, error) {
|
||||||
seen := map[string]bool{}
|
seen := map[string]bool{}
|
||||||
var out []string
|
var out []string
|
||||||
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
for _, pattern := range []string{
|
||||||
|
"/etc/systemd/system/bee-*.service",
|
||||||
|
"/lib/systemd/system/bee-*.service",
|
||||||
|
"/etc/systemd/system/bee-*.timer",
|
||||||
|
"/lib/systemd/system/bee-*.timer",
|
||||||
|
} {
|
||||||
matches, err := filepath.Glob(pattern)
|
matches, err := filepath.Glob(pattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
base := filepath.Base(match)
|
||||||
|
name := base
|
||||||
|
if strings.HasSuffix(base, ".service") {
|
||||||
|
name = strings.TrimSuffix(base, ".service")
|
||||||
|
}
|
||||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||||
if strings.HasSuffix(name, "@") {
|
if strings.HasSuffix(name, "@") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
|
||||||
|
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if !seen[name] {
|
if !seen[name] {
|
||||||
seen[name] = true
|
seen[name] = true
|
||||||
out = append(out, name)
|
out = append(out, name)
|
||||||
@@ -48,7 +61,12 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
if name == "bee-nvidia" && action == ServiceRestart {
|
||||||
|
return restartNvidiaDrivers()
|
||||||
|
}
|
||||||
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
|
|||||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||||
|
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
|
||||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||||
|
|||||||
@@ -9,6 +9,17 @@ type LiveBootSource struct {
|
|||||||
Device string `json:"device,omitempty"`
|
Device string `json:"device,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type LiveMediaRAMState struct {
|
||||||
|
LiveBootSource
|
||||||
|
State string `json:"state"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
ToramActive bool `json:"toram_active,omitempty"`
|
||||||
|
CopyPresent bool `json:"copy_present,omitempty"`
|
||||||
|
CopyComplete bool `json:"copy_complete,omitempty"`
|
||||||
|
CanStartCopy bool `json:"can_start_copy,omitempty"`
|
||||||
|
Message string `json:"message,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type InterfaceInfo struct {
|
type InterfaceInfo struct {
|
||||||
Name string
|
Name string
|
||||||
State string
|
State string
|
||||||
@@ -44,12 +55,12 @@ type StaticIPv4Config struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type RemovableTarget struct {
|
type RemovableTarget struct {
|
||||||
Device string
|
Device string `json:"device"`
|
||||||
FSType string
|
FSType string `json:"fs_type"`
|
||||||
Size string
|
Size string `json:"size"`
|
||||||
Label string
|
Label string `json:"label"`
|
||||||
Model string
|
Model string `json:"model"`
|
||||||
Mountpoint string
|
Mountpoint string `json:"mountpoint"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ToolStatus struct {
|
type ToolStatus struct {
|
||||||
@@ -70,6 +81,7 @@ type NvidiaStressOptions struct {
|
|||||||
Loader string
|
Loader string
|
||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
|
StaggerSeconds int
|
||||||
}
|
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
|
|||||||
31
audit/internal/platform/types_test.go
Normal file
31
audit/internal/platform/types_test.go
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
data, err := json.Marshal(RemovableTarget{
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
FSType: "exfat",
|
||||||
|
Size: "1.8T",
|
||||||
|
Label: "USB",
|
||||||
|
Model: "Flash",
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("marshal: %v", err)
|
||||||
|
}
|
||||||
|
raw := string(data)
|
||||||
|
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
|
||||||
|
if !strings.Contains(raw, key) {
|
||||||
|
t.Fatalf("json missing key %s: %s", key, raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
|
||||||
|
t.Fatalf("json still contains Go field names: %s", raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -15,12 +15,17 @@ type HardwareIngestRequest struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type RuntimeHealth struct {
|
type RuntimeHealth struct {
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
CheckedAt string `json:"checked_at"`
|
CheckedAt string `json:"checked_at"`
|
||||||
ExportDir string `json:"export_dir,omitempty"`
|
ExportDir string `json:"export_dir,omitempty"`
|
||||||
DriverReady bool `json:"driver_ready,omitempty"`
|
DriverReady bool `json:"driver_ready,omitempty"`
|
||||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||||
NetworkStatus string `json:"network_status,omitempty"`
|
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||||
|
NetworkStatus string `json:"network_status,omitempty"`
|
||||||
|
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
|
||||||
|
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||||
|
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||||
|
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||||
@@ -182,6 +187,13 @@ type HardwarePCIeDevice struct {
|
|||||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||||
|
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||||
|
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||||
|
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||||
|
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||||
|
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||||
|
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||||
|
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,10 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
@@ -9,30 +12,6 @@ import (
|
|||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
|
||||||
t.Setenv("DISPLAY", "")
|
|
||||||
t.Setenv("XAUTHORITY", "")
|
|
||||||
|
|
||||||
cmd := xrandrCommand("--query")
|
|
||||||
|
|
||||||
var hasDisplay bool
|
|
||||||
var hasXAuthority bool
|
|
||||||
for _, kv := range cmd.Env {
|
|
||||||
if kv == "DISPLAY=:0" {
|
|
||||||
hasDisplay = true
|
|
||||||
}
|
|
||||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
|
||||||
hasXAuthority = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !hasDisplay {
|
|
||||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
|
||||||
}
|
|
||||||
if !hasXAuthority {
|
|
||||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
@@ -62,8 +41,311 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
|||||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||||
t.Fatalf("burn profile=%q want smoke", got)
|
t.Fatalf("burn profile=%q want smoke", got)
|
||||||
}
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||||
|
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||||
|
|
||||||
|
h.handleAPIBlackboxStatus(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var state app.BlackboxState
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
|
||||||
|
t.Fatalf("decode state: %v", err)
|
||||||
|
}
|
||||||
|
if state.Status != "disabled" {
|
||||||
|
t.Fatalf("status=%q want disabled", state.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||||
|
exportDir := t.TempDir()
|
||||||
|
statePath := filepath.Join(exportDir, "blackbox-state.json")
|
||||||
|
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
|
||||||
|
t.Fatalf("write state: %v", err)
|
||||||
|
}
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||||
|
|
||||||
|
h.handleAPIBlackboxStatus(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
|
||||||
|
t.Fatalf("body=%s", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
task := globalQueue.tasks[0]
|
||||||
|
if task.Target != "nvidia-bench-perf" {
|
||||||
|
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
|
||||||
|
}
|
||||||
|
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||||
|
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||||
|
}
|
||||||
|
if task.params.RunNCCL {
|
||||||
|
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||||
|
}
|
||||||
|
if task.Priority != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var resp taskRunResponse
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||||
|
t.Fatalf("decode response: %v", err)
|
||||||
|
}
|
||||||
|
if len(resp.TaskIDs) != 2 {
|
||||||
|
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
|
||||||
|
// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
task := globalQueue.tasks[0]
|
||||||
|
if task.Target != "nvidia-bench-power" {
|
||||||
|
t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
|
||||||
|
}
|
||||||
|
if task.Priority != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
if task.params.RampTotal != 3 {
|
||||||
|
t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHandleAPIBenchmarkAutotuneRunQueuesTask verifies that POSTing to the
// autotune run endpoint returns 200 and enqueues exactly one
// "nvidia-bench-autotune" task carrying the benchmark kind from the request
// body.
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
	// Isolate the shared queue: clear its task list now and restore the
	// original list on cleanup so other tests are unaffected.
	globalQueue.mu.Lock()
	originalTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = originalTasks
		globalQueue.mu.Unlock()
	})

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
	rec := httptest.NewRecorder()

	h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)

	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	// Inspect the queue under lock: exactly one autotune task should have
	// been enqueued, carrying the requested benchmark kind.
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 1 {
		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
	}
	task := globalQueue.tasks[0]
	if task.Target != "nvidia-bench-autotune" {
		t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
	}
	if task.params.BenchmarkKind != "power-fit" {
		t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
	}
}
|
||||||
|
|
||||||
|
// TestHandleAPISATRunSplitsMixedNvidiaTaskSet verifies that a SAT run over a
// mixed GPU set (two H100s plus one H200) is split into two queued tasks —
// indices [0 1] and [2] — each at validate priority.
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
	// Isolate the shared queue: clear its task list now and restore the
	// original list on cleanup.
	globalQueue.mu.Lock()
	originalTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = originalTasks
		globalQueue.mu.Unlock()
	})
	// Stub GPU discovery: indices 0 and 1 report one model name, index 2
	// another, so the handler sees a mixed fleet.
	prevList := apiListNvidiaGPUs
	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
		return []platform.NvidiaGPU{
			{Index: 0, Name: "NVIDIA H100 PCIe"},
			{Index: 1, Name: "NVIDIA H100 PCIe"},
			{Index: 2, Name: "NVIDIA H200 NVL"},
		}, nil
	}
	t.Cleanup(func() { apiListNvidiaGPUs = prevList })

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
	rec := httptest.NewRecorder()

	h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)

	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	// Expect one task per group: [0 1] for the matching pair, [2] alone.
	if len(globalQueue.tasks) != 2 {
		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
	}
	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
	}
	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
		t.Fatalf("task[1] gpu indices=%v want [2]", got)
	}
	// Both split tasks keep validate priority.
	if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
		t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
	}
	if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
		t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
	}
}
|
||||||
|
|
||||||
|
// TestDefaultTaskPriorityOrder pins both the target→priority mapping of
// defaultTaskPriority and the relative ordering of the priority constants:
// install-to-ram > audit > validate > validate-stress > burn > benchmark,
// with both benchmark kinds sharing one priority.
func TestDefaultTaskPriorityOrder(t *testing.T) {
	got := []int{
		defaultTaskPriority("install-to-ram", taskParams{}),
		defaultTaskPriority("audit", taskParams{}),
		defaultTaskPriority("cpu", taskParams{}),
		defaultTaskPriority("cpu", taskParams{StressMode: true}),
		defaultTaskPriority("nvidia-stress", taskParams{}),
		defaultTaskPriority("nvidia-bench-perf", taskParams{}),
		defaultTaskPriority("nvidia-bench-power", taskParams{}),
	}
	want := []int{
		taskPriorityInstallToRAM,
		taskPriorityAudit,
		taskPriorityValidate,
		taskPriorityValidateStress,
		taskPriorityBurn,
		taskPriorityBenchmark,
		taskPriorityBenchmark,
	}
	// Each target must map to its expected priority constant.
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
		}
	}
	// The constants themselves must be strictly decreasing, except the two
	// benchmark kinds, which must be equal.
	if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
		t.Fatalf("priority order=%v", got)
	}
}
|
||||||
|
|
||||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
h := &handler{}
|
h := &handler{}
|
||||||
|
|||||||
992
audit/internal/webui/charts_svg.go
Normal file
992
audit/internal/webui/charts_svg.go
Normal file
@@ -0,0 +1,992 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// chartTimelineSegment is one contiguous span of chart time classified as
// task-active or idle; segments are shaded/bounded by the timeline writers.
type chartTimelineSegment struct {
	Start  time.Time
	End    time.Time
	Active bool
}

// chartScale describes a Y axis: the value range drawn and the tick values.
type chartScale struct {
	Min   float64
	Max   float64
	Ticks []float64
}

// chartLayout holds the canvas size and the pixel rectangle of the plot area
// within it.
type chartLayout struct {
	Width      int
	Height     int
	PlotLeft   int
	PlotRight  int
	PlotTop    int
	PlotBottom int
}

// metricChartSeries is one plotted line: its legend name, an axis title
// (used by the multi-axis GPU overview chart), stroke color, and values.
type metricChartSeries struct {
	Name      string
	AxisTitle string
	Color     string
	Values    []float64
}
|
||||||
|
|
||||||
|
// metricChartPalette is the cycle of series colors; series index i uses
// metricChartPalette[i%len(metricChartPalette)].
var metricChartPalette = []string{
	"#5794f2",
	"#73bf69",
	"#f2cc0c",
	"#ff9830",
	"#f2495c",
	"#b877d9",
	"#56d2f7",
	"#8ab8ff",
	"#9adf8f",
	"#ffbe5c",
}

// gpuLabelCache is a mutex-guarded cache of GPU labels keyed by GPU index,
// stamped with its load time. (Population and expiry logic live elsewhere in
// this package — not visible here.)
var gpuLabelCache struct {
	mu       sync.Mutex
	loadedAt time.Time
	byIndex  map[int]string
}
|
||||||
|
|
||||||
|
// renderMetricChartSVG renders a single-axis, multi-series line chart as a
// complete standalone SVG document. labels/times are padded (or synthesized)
// to a common point count, empty datasets become all-zero series, and the
// data is downsampled before drawing. yMin/yMax, when non-nil, override the
// data-derived axis bounds. timeline shades idle spans behind the plot.
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
	// Normalize to a single point count covering both labels and times.
	pointCount := len(labels)
	if len(times) > pointCount {
		pointCount = len(times)
	}
	if pointCount == 0 {
		// Keep the chart renderable with one empty placeholder point.
		pointCount = 1
		labels = []string{""}
		times = []time.Time{time.Time{}}
	}
	if len(labels) < pointCount {
		padded := make([]string, pointCount)
		copy(padded, labels)
		labels = padded
	}
	if len(times) < pointCount {
		times = synthesizeChartTimes(times, pointCount)
	}
	for i := range datasets {
		if len(datasets[i]) == 0 {
			datasets[i] = make([]float64, pointCount)
		}
	}

	// Downsample to at most ~1400 points (one per pixel) before building SVG.
	times, datasets = downsampleTimeSeries(times, datasets, 1400)
	pointCount = len(times)

	statsLabel := chartStatsLabel(datasets)

	// One legend entry per name; the index guard tolerates fewer datasets
	// than names. Values are normalized via coalesceDataset (defined
	// elsewhere in this package).
	legendItems := []metricChartSeries{}
	for i, name := range names {
		color := metricChartPalette[i%len(metricChartPalette)]
		values := make([]float64, pointCount)
		if i < len(datasets) {
			copy(values, coalesceDataset(datasets[i], pointCount))
		}
		legendItems = append(legendItems, metricChartSeries{
			Name:   name,
			Color:  color,
			Values: values,
		})
	}

	scale := singleAxisChartScale(datasets, yMin, yMax)
	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
	start, end := chartTimeBounds(times)

	// Assemble the SVG back-to-front: frame, idle shading, grids, timeline
	// boundaries, border, axes, then the series lines and legend on top.
	var b strings.Builder
	writeSVGOpen(&b, layout.Width, layout.Height)
	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
	writeTimelineIdleSpans(&b, layout, start, end, timeline)
	writeVerticalGrid(&b, layout, times, pointCount, 8)
	writeHorizontalGrid(&b, layout, scale)
	writeTimelineBoundaries(&b, layout, start, end, timeline)
	writePlotBorder(&b, layout)
	writeSingleAxisY(&b, layout, scale)
	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
	for _, item := range legendItems {
		writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
	}
	writeLegend(&b, layout, legendItems)
	writeSVGClose(&b)
	return []byte(b.String()), nil
}
|
||||||
|
|
||||||
|
// renderGPUOverviewChartSVG builds the per-GPU overview chart (temperature,
// power, core clock) for GPU idx from the live sample history. The boolean
// result is false when the samples contain no data at all for that GPU.
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
	temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
	power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
	coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
	if temp == nil && power == nil && coreClock == nil {
		// No rows for this GPU index in any sample: nothing to draw.
		return nil, false, nil
	}
	labels := sampleTimeLabels(samples)
	times := sampleTimes(samples)
	// Series order (temp, power, clock) must match the three fixed axes in
	// drawGPUOverviewChartSVG.
	svg, err := drawGPUOverviewChartSVG(
		gpuDisplayLabel(idx)+" Overview",
		labels,
		times,
		[]metricChartSeries{
			{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
			{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
			{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
		},
		timeline,
	)
	if err != nil {
		return nil, false, err
	}
	return svg, true, nil
}
|
||||||
|
|
||||||
|
// drawGPUOverviewChartSVG renders the three-series GPU overview chart, each
// series with its own independent Y scale and a dedicated colored axis
// column (two on the left of the plot, one on the right). Exactly three
// series are required; fewer or more is an error.
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
	if len(series) != 3 {
		return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
	}
	// Fixed canvas geometry for the overview chart.
	const (
		width      = 1400
		height     = 840
		plotLeft   = 180
		plotRight  = 1220
		plotTop    = 96
		plotBottom = 660
	)
	// X positions of the three per-series axis lines.
	const (
		leftOuterAxis  = 72
		leftInnerAxis  = 132
		rightInnerAxis = 1268
	)
	layout := chartLayout{
		Width:      width,
		Height:     height,
		PlotLeft:   plotLeft,
		PlotRight:  plotRight,
		PlotTop:    plotTop,
		PlotBottom: plotBottom,
	}
	// axisX[i] is the axis column for series[i]; order matters.
	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
	// Normalize labels/times/series values to a common point count.
	pointCount := len(labels)
	if len(times) > pointCount {
		pointCount = len(times)
	}
	if pointCount == 0 {
		pointCount = 1
		labels = []string{""}
		times = []time.Time{time.Time{}}
	}
	if len(labels) < pointCount {
		padded := make([]string, pointCount)
		copy(padded, labels)
		labels = padded
	}
	if len(times) < pointCount {
		times = synthesizeChartTimes(times, pointCount)
	}
	for i := range series {
		if len(series[i].Values) == 0 {
			series[i].Values = make([]float64, pointCount)
		}
	}

	// Downsample to at most ~1400 points before building SVG.
	{
		datasets := make([][]float64, len(series))
		for i := range series {
			datasets[i] = series[i].Values
		}
		times, datasets = downsampleTimeSeries(times, datasets, 1400)
		pointCount = len(times)
		for i := range series {
			series[i].Values = datasets[i]
		}
	}

	// Compute an independent "nice" scale per series.
	scales := make([]chartScale, len(series))
	for i := range series {
		min, max := chartSeriesBounds(series[i].Values)
		ticks := chartNiceTicks(min, max, 8)
		scales[i] = chartScale{
			Min:   ticks[0],
			Max:   ticks[len(ticks)-1],
			Ticks: ticks,
		}
	}
	start, end := chartTimeBounds(times)

	var b strings.Builder
	writeSVGOpen(&b, width, height)
	writeChartFrame(&b, title, "", width, height)
	writeTimelineIdleSpans(&b, layout, start, end, timeline)
	writeVerticalGrid(&b, layout, times, pointCount, 8)
	// Horizontal grid follows the first series' scale only.
	writeHorizontalGrid(&b, layout, scales[0])
	writeTimelineBoundaries(&b, layout, start, end, timeline)
	writePlotBorder(&b, layout)

	// Draw each colored axis line, its title, and its tick marks/labels. The
	// two left axes (i < 2) put tick labels left of the line; the right axis
	// mirrors them to the right.
	for i, axisLineX := range axisX {
		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
			axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
		fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
			axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
		for _, tick := range scales[i].Ticks {
			y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
			label := sanitizeChartText(chartYAxisNumber(tick))
			if i < 2 {
				fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
					axisLineX, y, axisLineX+6, y, series[i].Color)
				fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
					axisLineX-8, y, series[i].Color, label)
				continue
			}
			fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
				axisLineX, y, axisLineX-6, y, series[i].Color)
			fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
				axisLineX+8, y, series[i].Color, label)
		}
	}

	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
	// Each series is drawn against its own scale.
	for i := range series {
		writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
	}
	writeLegend(&b, layout, series)
	writeSVGClose(&b)
	return []byte(b.String()), nil
}
|
||||||
|
|
||||||
|
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
times := sampleTimes(samples)
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
|
||||||
|
}
|
||||||
|
|
||||||
|
func snapshotTaskHistory() []Task {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
out := make([]Task, len(globalQueue.tasks))
|
||||||
|
for i, t := range globalQueue.tasks {
|
||||||
|
out[i] = *t
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartTimelineSegmentsForRange classifies the [start, end] chart range into
// alternating active/idle segments based on task start/done times. Tasks with
// no StartedAt are ignored; a task without DoneAt is treated as running until
// "now". Overlapping task intervals are merged before the gaps between them
// are emitted as idle segments. An inverted range is swapped; zero bounds
// yield nil.
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
	if start.IsZero() || end.IsZero() {
		return nil
	}
	if end.Before(start) {
		start, end = end, start
	}
	type interval struct {
		start time.Time
		end   time.Time
	}
	// Collect each task's run interval, clipped to [start, end].
	active := make([]interval, 0, len(tasks))
	for _, task := range tasks {
		if task.StartedAt == nil {
			continue
		}
		intervalStart := task.StartedAt.UTC()
		intervalEnd := now.UTC()
		if task.DoneAt != nil {
			intervalEnd = task.DoneAt.UTC()
		}
		if !intervalEnd.After(intervalStart) {
			continue // empty or inverted interval
		}
		if intervalEnd.Before(start) || intervalStart.After(end) {
			continue // entirely outside the chart range
		}
		if intervalStart.Before(start) {
			intervalStart = start
		}
		if intervalEnd.After(end) {
			intervalEnd = end
		}
		active = append(active, interval{start: intervalStart, end: intervalEnd})
	}
	// Sort by start (then end) so overlapping intervals become adjacent.
	sort.Slice(active, func(i, j int) bool {
		if active[i].start.Equal(active[j].start) {
			return active[i].end.Before(active[j].end)
		}
		return active[i].start.Before(active[j].start)
	})
	// Merge overlapping/touching intervals into disjoint spans.
	merged := make([]interval, 0, len(active))
	for _, span := range active {
		if len(merged) == 0 {
			merged = append(merged, span)
			continue
		}
		last := &merged[len(merged)-1]
		if !span.start.After(last.end) {
			if span.end.After(last.end) {
				last.end = span.end
			}
			continue
		}
		merged = append(merged, span)
	}

	// Walk the range, emitting idle gaps between the merged active spans.
	segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
	cursor := start
	for _, span := range merged {
		if span.start.After(cursor) {
			segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
		}
		segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
		cursor = span.end
	}
	if cursor.Before(end) {
		segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
	}
	if len(segments) == 0 {
		// No overlapping tasks at all: the whole range is one idle segment.
		segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
	}
	return segments
}
|
||||||
|
|
||||||
|
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
|
||||||
|
times := make([]time.Time, 0, len(samples))
|
||||||
|
for _, sample := range samples {
|
||||||
|
times = append(times, sample.Timestamp)
|
||||||
|
}
|
||||||
|
return times
|
||||||
|
}
|
||||||
|
|
||||||
|
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
|
||||||
|
min, max := 0.0, 1.0
|
||||||
|
if yMin != nil && yMax != nil {
|
||||||
|
min, max = *yMin, *yMax
|
||||||
|
} else {
|
||||||
|
min, max = chartSeriesBounds(flattenDatasets(datasets))
|
||||||
|
if yMin != nil {
|
||||||
|
min = *yMin
|
||||||
|
}
|
||||||
|
if yMax != nil {
|
||||||
|
max = *yMax
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ticks := chartNiceTicks(min, max, 8)
|
||||||
|
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
|
||||||
|
}
|
||||||
|
|
||||||
|
// flattenDatasets concatenates all datasets into one slice, preallocating the
// exact combined capacity.
func flattenDatasets(datasets [][]float64) []float64 {
	n := 0
	for _, d := range datasets {
		n += len(d)
	}
	flat := make([]float64, 0, n)
	for _, d := range datasets {
		flat = append(flat, d...)
	}
	return flat
}
|
||||||
|
|
||||||
|
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
|
||||||
|
legendRows := 0
|
||||||
|
if chartLegendVisible(seriesCount) && seriesCount > 0 {
|
||||||
|
cols := 4
|
||||||
|
if seriesCount < cols {
|
||||||
|
cols = seriesCount
|
||||||
|
}
|
||||||
|
legendRows = (seriesCount + cols - 1) / cols
|
||||||
|
}
|
||||||
|
legendHeight := 0
|
||||||
|
if legendRows > 0 {
|
||||||
|
legendHeight = legendRows*24 + 24
|
||||||
|
}
|
||||||
|
return chartLayout{
|
||||||
|
Width: 1400,
|
||||||
|
Height: canvasHeight,
|
||||||
|
PlotLeft: 96,
|
||||||
|
PlotRight: 1352,
|
||||||
|
PlotTop: 72,
|
||||||
|
PlotBottom: canvasHeight - 60 - legendHeight,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
|
||||||
|
if len(times) == 0 {
|
||||||
|
return time.Time{}, time.Time{}
|
||||||
|
}
|
||||||
|
start := times[0].UTC()
|
||||||
|
end := start
|
||||||
|
for _, ts := range times[1:] {
|
||||||
|
t := ts.UTC()
|
||||||
|
if t.Before(start) {
|
||||||
|
start = t
|
||||||
|
}
|
||||||
|
if t.After(end) {
|
||||||
|
end = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return start, end
|
||||||
|
}
|
||||||
|
|
||||||
|
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
||||||
|
if count <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if len(times) == count {
|
||||||
|
return times
|
||||||
|
}
|
||||||
|
if len(times) == 1 {
|
||||||
|
out := make([]time.Time, count)
|
||||||
|
for i := range out {
|
||||||
|
out[i] = times[0].Add(time.Duration(i) * time.Minute)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
|
||||||
|
out := make([]time.Time, count)
|
||||||
|
for i := range out {
|
||||||
|
out[i] = base.Add(time.Duration(i) * time.Minute)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||||
|
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||||
|
// power charts where the filled area of each PSU shows its individual
|
||||||
|
// contribution and the total height equals the combined draw.
|
||||||
|
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range datasets {
|
||||||
|
if len(datasets[i]) == 0 {
|
||||||
|
datasets[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||||
|
pointCount = len(times)
|
||||||
|
|
||||||
|
// Build cumulative sums per time point.
|
||||||
|
cumulative := make([][]float64, len(datasets)+1)
|
||||||
|
for i := range cumulative {
|
||||||
|
cumulative[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
for i, ds := range datasets {
|
||||||
|
for j, v := range ds {
|
||||||
|
cumulative[i+1][j] = cumulative[i][j] + v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale is based on the total (top cumulative row).
|
||||||
|
total := cumulative[len(cumulative)-1]
|
||||||
|
yMin := floatPtr(0)
|
||||||
|
if yMax == nil {
|
||||||
|
yMax = autoMax120(total)
|
||||||
|
}
|
||||||
|
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||||
|
|
||||||
|
legendItems := make([]metricChartSeries, len(datasets))
|
||||||
|
for i, name := range names {
|
||||||
|
color := metricChartPalette[i%len(metricChartPalette)]
|
||||||
|
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stats label from totals.
|
||||||
|
statsLabel := chartStatsLabel([][]float64{total})
|
||||||
|
|
||||||
|
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
writeSingleAxisY(&b, layout, scale)
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
|
||||||
|
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
// Draw border polylines on top.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
|
||||||
|
writeLegend(&b, layout, legendItems)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeStackedArea draws a filled polygon between two cumulative value arrays
// (baseline and top), using the given color at 55% opacity.
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
	n := len(top)
	if n == 0 {
		return
	}
	// A short baseline is replaced by all-zeros (the bottom of the stack).
	if len(baseline) < n {
		baseline = make([]float64, n)
	}

	// Forward path along top values, then backward along baseline values.
	// Together they enclose this layer's band as a closed polygon.
	var points strings.Builder
	for i := 0; i < n; i++ {
		x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
		y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
		if i > 0 {
			points.WriteByte(' ')
		}
		points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
		points.WriteByte(',')
		points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
	}
	for i := n - 1; i >= 0; i-- {
		x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
		y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
		points.WriteByte(' ')
		points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
		points.WriteByte(',')
		points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
	}
	fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
}
|
||||||
|
|
||||||
|
func writeSVGOpen(b *strings.Builder, width, height int) {
|
||||||
|
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSVGClose(b *strings.Builder) {
|
||||||
|
b.WriteString("</svg>\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||||
|
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(title))
|
||||||
|
if strings.TrimSpace(subtitle) != "" {
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(subtitle))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||||
|
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||||
|
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
|
||||||
|
for _, tick := range scale.Ticks {
|
||||||
|
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||||
|
layout.PlotLeft, y, layout.PlotRight, y)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
|
||||||
|
if pointCount <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
|
||||||
|
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||||
|
ts := chartPointTime(times, idx)
|
||||||
|
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||||
|
x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
|
||||||
|
for _, tick := range scale.Ticks {
|
||||||
|
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, y, layout.PlotLeft-6, y)
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
|
||||||
|
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
|
||||||
|
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||||
|
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
label := ""
|
||||||
|
if idx < len(labels) {
|
||||||
|
label = labels[idx]
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
|
||||||
|
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
|
||||||
|
if len(values) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var points strings.Builder
|
||||||
|
for idx, value := range values {
|
||||||
|
if idx > 0 {
|
||||||
|
points.WriteByte(' ')
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
|
||||||
|
points.String(), color)
|
||||||
|
if len(values) == 1 {
|
||||||
|
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
peakIdx := 0
|
||||||
|
peakValue := values[0]
|
||||||
|
for idx, value := range values[1:] {
|
||||||
|
if value >= peakValue {
|
||||||
|
peakIdx = idx + 1
|
||||||
|
peakValue = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||||
|
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||||
|
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||||
|
if !chartLegendVisible(len(series)) || len(series) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cols := 4
|
||||||
|
if len(series) < cols {
|
||||||
|
cols = len(series)
|
||||||
|
}
|
||||||
|
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
|
||||||
|
baseY := layout.PlotBottom + 74
|
||||||
|
for i, item := range series {
|
||||||
|
row := i / cols
|
||||||
|
col := i % cols
|
||||||
|
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
|
||||||
|
y := float64(baseY + row*24)
|
||||||
|
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
|
||||||
|
x, y, x+28, y, item.Color)
|
||||||
|
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
|
||||||
|
x+38, y+4, sanitizeChartText(item.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
|
||||||
|
for _, segment := range segments {
|
||||||
|
if segment.Active || !segment.End.After(segment.Start) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
|
||||||
|
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
seen := map[int]bool{}
|
||||||
|
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
|
||||||
|
for i, segment := range segments {
|
||||||
|
if i > 0 {
|
||||||
|
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||||
|
if !seen[x] {
|
||||||
|
seen[x] = true
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if i < len(segments)-1 {
|
||||||
|
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||||
|
if !seen[x] {
|
||||||
|
seen[x] = true
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// downsampleTimeSeries reduces the time series to at most maxPts points using
|
||||||
|
// min-max bucketing. Each bucket contributes the index of its min and max value
|
||||||
|
// (using the first full-length dataset as the reference series). All parallel
|
||||||
|
// datasets are sampled at those same indices so all series stay aligned.
|
||||||
|
// If len(times) <= maxPts the inputs are returned unchanged.
|
||||||
|
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
|
||||||
|
n := len(times)
|
||||||
|
if n <= maxPts || maxPts <= 0 {
|
||||||
|
return times, datasets
|
||||||
|
}
|
||||||
|
buckets := maxPts / 2
|
||||||
|
if buckets < 1 {
|
||||||
|
buckets = 1
|
||||||
|
}
|
||||||
|
// Use the first dataset that has the same length as times as the reference
|
||||||
|
// for deciding which two indices to keep per bucket.
|
||||||
|
var ref []float64
|
||||||
|
for _, ds := range datasets {
|
||||||
|
if len(ds) == n {
|
||||||
|
ref = ds
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
selected := make([]int, 0, maxPts)
|
||||||
|
bucketSize := float64(n) / float64(buckets)
|
||||||
|
for b := 0; b < buckets; b++ {
|
||||||
|
lo := int(math.Round(float64(b) * bucketSize))
|
||||||
|
hi := int(math.Round(float64(b+1) * bucketSize))
|
||||||
|
if hi > n {
|
||||||
|
hi = n
|
||||||
|
}
|
||||||
|
if lo >= hi {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if ref == nil {
|
||||||
|
selected = append(selected, lo)
|
||||||
|
if hi-1 != lo {
|
||||||
|
selected = append(selected, hi-1)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
minIdx, maxIdx := lo, lo
|
||||||
|
for i := lo + 1; i < hi; i++ {
|
||||||
|
if ref[i] < ref[minIdx] {
|
||||||
|
minIdx = i
|
||||||
|
}
|
||||||
|
if ref[i] > ref[maxIdx] {
|
||||||
|
maxIdx = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if minIdx <= maxIdx {
|
||||||
|
selected = append(selected, minIdx)
|
||||||
|
if maxIdx != minIdx {
|
||||||
|
selected = append(selected, maxIdx)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
selected = append(selected, maxIdx)
|
||||||
|
if minIdx != maxIdx {
|
||||||
|
selected = append(selected, minIdx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
outTimes := make([]time.Time, len(selected))
|
||||||
|
for i, idx := range selected {
|
||||||
|
outTimes[i] = times[idx]
|
||||||
|
}
|
||||||
|
outDatasets := make([][]float64, len(datasets))
|
||||||
|
for d, ds := range datasets {
|
||||||
|
if len(ds) != n {
|
||||||
|
outDatasets[d] = ds
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out := make([]float64, len(selected))
|
||||||
|
for i, idx := range selected {
|
||||||
|
out[i] = ds[idx]
|
||||||
|
}
|
||||||
|
outDatasets[d] = out
|
||||||
|
}
|
||||||
|
return outTimes, outDatasets
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
||||||
|
if !end.After(start) {
|
||||||
|
return float64(left+right) / 2
|
||||||
|
}
|
||||||
|
if ts.Before(start) {
|
||||||
|
ts = start
|
||||||
|
}
|
||||||
|
if ts.After(end) {
|
||||||
|
ts = end
|
||||||
|
}
|
||||||
|
ratio := float64(ts.Sub(start)) / float64(end.Sub(start))
|
||||||
|
return float64(left) + ratio*float64(right-left)
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartPointTime(times []time.Time, idx int) time.Time {
|
||||||
|
if idx >= 0 && idx < len(times) && !times[idx].IsZero() {
|
||||||
|
return times[idx].UTC()
|
||||||
|
}
|
||||||
|
if len(times) > 0 && !times[0].IsZero() {
|
||||||
|
return times[0].UTC().Add(time.Duration(idx) * time.Minute)
|
||||||
|
}
|
||||||
|
return time.Now().UTC().Add(time.Duration(idx) * time.Minute)
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
|
||||||
|
if scale.Max <= scale.Min {
|
||||||
|
return float64(plotTop+plotBottom) / 2
|
||||||
|
}
|
||||||
|
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartSeriesBounds computes padded (min, max) axis bounds for a series.
// Empty input yields (0, 1). A flat nonzero series is padded by 10% of its
// magnitude (or 1 when that is zero). A series sitting entirely above zero
// gets 20% padding on both sides, with the lower bound clamped at zero.
// Locals are named lo/hi to avoid shadowing the min/max builtins.
func chartSeriesBounds(values []float64) (float64, float64) {
	if len(values) == 0 {
		return 0, 1
	}
	lo, hi := values[0], values[0]
	for _, v := range values[1:] {
		if v < lo {
			lo = v
		}
		if v > hi {
			hi = v
		}
	}
	if lo == hi {
		if hi == 0 {
			return 0, 1
		}
		pad := math.Abs(hi) * 0.1
		if pad == 0 {
			pad = 1
		}
		lo -= pad
		hi += pad
	}
	if lo > 0 {
		pad := (hi - lo) * 0.2
		if pad == 0 {
			pad = hi * 0.1
		}
		lo -= pad
		if lo < 0 {
			lo = 0
		}
		hi += pad
	}
	return lo, hi
}
|
||||||
|
|
||||||
|
// chartNiceTicks produces "nice" axis tick values covering [min, max], aiming
// for roughly target ticks. Steps are a power of ten times 1, 2, 5 or 10; the
// first factor yielding at most target*1.5 intervals wins. Each tick is
// rounded to 1e-9 to suppress floating-point accumulation noise.
func chartNiceTicks(min, max float64, target int) []float64 {
	if min == max {
		max = min + 1
	}
	span := max - min
	base := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
	step := base
	for _, factor := range []float64{1, 2, 5, 10} {
		if span/(factor*base) <= float64(target)*1.5 {
			step = factor * base
			break
		}
	}
	low := math.Floor(min/step) * step
	high := math.Ceil(max/step) * step
	var ticks []float64
	// The step*0.001 slack keeps the final tick despite rounding error.
	for v := low; v <= high+step*0.001; v += step {
		ticks = append(ticks, math.Round(v*1e9)/1e9)
	}
	return ticks
}
|
||||||
|
|
||||||
|
func valueClamp(value float64, scale chartScale) float64 {
|
||||||
|
if value < scale.Min {
|
||||||
|
return scale.Min
|
||||||
|
}
|
||||||
|
if value > scale.Max {
|
||||||
|
return scale.Max
|
||||||
|
}
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartStatsLabel(datasets [][]float64) string {
|
||||||
|
mn, avg, mx := globalStats(datasets)
|
||||||
|
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("min %s avg %s max %s",
|
||||||
|
chartLegendNumber(mn),
|
||||||
|
chartLegendNumber(avg),
|
||||||
|
chartLegendNumber(mx),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDisplayLabel(idx int) string {
|
||||||
|
if name := gpuModelNameByIndex(idx); name != "" {
|
||||||
|
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("GPU %d", idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuModelNameByIndex(idx int) string {
|
||||||
|
now := time.Now()
|
||||||
|
gpuLabelCache.mu.Lock()
|
||||||
|
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||||
|
gpuLabelCache.loadedAt = now
|
||||||
|
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||||
|
gpuLabelCache.mu.Unlock()
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadGPUModelNames() map[int]string {
|
||||||
|
out := map[int]string{}
|
||||||
|
gpus, err := platform.New().ListNvidiaGPUs()
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
name := strings.TrimSpace(gpu.Name)
|
||||||
|
if name != "" {
|
||||||
|
out[gpu.Index] = name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
@@ -1,6 +1,9 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -9,13 +12,33 @@ import (
|
|||||||
|
|
||||||
// jobState holds the output lines and completion status of an async job.
|
// jobState holds the output lines and completion status of an async job.
|
||||||
type jobState struct {
|
type jobState struct {
|
||||||
lines []string
|
lines []string
|
||||||
done bool
|
done bool
|
||||||
err string
|
err string
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
subs []chan string
|
subs []chan string
|
||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
logPath string
|
logPath string
|
||||||
|
serialPrefix string
|
||||||
|
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||||
|
logBuf *bufio.Writer
|
||||||
|
}
|
||||||
|
|
||||||
|
// readTaskLogFile reads a task log, refusing files over 50 MB.
|
||||||
|
func readTaskLogFile(path string) ([]byte, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
data, err := io.ReadAll(io.LimitReader(f, 50<<20+1))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if int64(len(data)) > 50<<20 {
|
||||||
|
return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// abort cancels the job if it has a cancel function and is not yet done.
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
@@ -30,11 +53,22 @@ func (j *jobState) abort() bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (j *jobState) append(line string) {
|
func (j *jobState) append(line string) {
|
||||||
|
j.appendWithOptions(line, true, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) appendFromLog(line string) {
|
||||||
|
j.appendWithOptions(line, false, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
|
||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
j.lines = append(j.lines, line)
|
j.lines = append(j.lines, line)
|
||||||
if j.logPath != "" {
|
if persistLog && j.logPath != "" {
|
||||||
appendJobLog(j.logPath, line)
|
j.writeLogLineLocked(line)
|
||||||
|
}
|
||||||
|
if serialMirror && j.serialPrefix != "" {
|
||||||
|
taskSerialWriteLine(j.serialPrefix + line)
|
||||||
}
|
}
|
||||||
for _, ch := range j.subs {
|
for _, ch := range j.subs {
|
||||||
select {
|
select {
|
||||||
@@ -44,6 +78,35 @@ func (j *jobState) append(line string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
|
||||||
|
// Must be called with j.mu held. Uses a buffered writer kept open for the task
|
||||||
|
// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
|
||||||
|
func (j *jobState) writeLogLineLocked(line string) {
|
||||||
|
if j.logFile == nil {
|
||||||
|
f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
j.logFile = f
|
||||||
|
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||||
|
}
|
||||||
|
_, _ = j.logBuf.WriteString(line + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||||
|
func (j *jobState) closeLog() {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
if j.logBuf != nil {
|
||||||
|
_ = j.logBuf.Flush()
|
||||||
|
}
|
||||||
|
if j.logFile != nil {
|
||||||
|
_ = j.logFile.Close()
|
||||||
|
j.logFile = nil
|
||||||
|
j.logBuf = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (j *jobState) finish(errMsg string) {
|
func (j *jobState) finish(errMsg string) {
|
||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
@@ -84,12 +147,12 @@ func (m *jobManager) create(id string) *jobState {
|
|||||||
j := &jobState{}
|
j := &jobState{}
|
||||||
m.jobs[id] = j
|
m.jobs[id] = j
|
||||||
// Schedule cleanup after 30 minutes
|
// Schedule cleanup after 30 minutes
|
||||||
go func() {
|
goRecoverOnce("job cleanup", func() {
|
||||||
time.Sleep(30 * time.Minute)
|
time.Sleep(30 * time.Minute)
|
||||||
m.mu.Lock()
|
m.mu.Lock()
|
||||||
delete(m.jobs, id)
|
delete(m.jobs, id)
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
}()
|
})
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,12 +170,15 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
|||||||
return j, ok
|
return j, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func newTaskJobState(logPath string) *jobState {
|
func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||||
j := &jobState{logPath: logPath}
|
j := &jobState{logPath: logPath}
|
||||||
|
if len(serialPrefix) > 0 {
|
||||||
|
j.serialPrefix = serialPrefix[0]
|
||||||
|
}
|
||||||
if logPath == "" {
|
if logPath == "" {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
data, err := os.ReadFile(logPath)
|
data, err := readTaskLogFile(logPath)
|
||||||
if err != nil || len(data) == 0 {
|
if err != nil || len(data) == 0 {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,10 +17,10 @@ import (
|
|||||||
// It supports multiple concurrent SAT tasks: a shared event window is open
|
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||||
// while any SAT task is running, and flushed when all tasks complete.
|
// while any SAT task is running, and flushed when all tasks complete.
|
||||||
type kmsgWatcher struct {
|
type kmsgWatcher struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
activeCount int // number of in-flight SAT tasks
|
activeCount int // number of in-flight SAT tasks
|
||||||
window *kmsgWindow
|
window *kmsgWindow
|
||||||
statusDB *app.ComponentStatusDB
|
statusDB *app.ComponentStatusDB
|
||||||
}
|
}
|
||||||
|
|
||||||
type kmsgWindow struct {
|
type kmsgWindow struct {
|
||||||
@@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
|||||||
|
|
||||||
// start launches the background kmsg reading goroutine.
|
// start launches the background kmsg reading goroutine.
|
||||||
func (w *kmsgWatcher) start() {
|
func (w *kmsgWatcher) start() {
|
||||||
go w.run()
|
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *kmsgWatcher) run() {
|
func (w *kmsgWatcher) run() {
|
||||||
f, err := os.Open("/dev/kmsg")
|
for {
|
||||||
if err != nil {
|
f, err := os.Open("/dev/kmsg")
|
||||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
if err != nil {
|
||||||
return
|
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||||
}
|
time.Sleep(30 * time.Second)
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
// Best-effort seek to end so we only capture events from now forward.
|
|
||||||
_, _ = f.Seek(0, io.SeekEnd)
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(f)
|
|
||||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
|
||||||
for scanner.Scan() {
|
|
||||||
line := scanner.Text()
|
|
||||||
evt, ok := parseKmsgLine(line)
|
|
||||||
if !ok {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
w.mu.Lock()
|
// Best-effort seek to end so we only capture events from now forward.
|
||||||
if w.window != nil {
|
_, _ = f.Seek(0, io.SeekEnd)
|
||||||
w.recordEvent(evt)
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
evt, ok := parseKmsgLine(line)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.window != nil {
|
||||||
|
w.recordEvent(evt)
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
}
|
}
|
||||||
w.mu.Unlock()
|
if err := scanner.Err(); err != nil {
|
||||||
}
|
slog.Warn("kmsg watcher stopped", "err", err)
|
||||||
if err := scanner.Err(); err != nil {
|
}
|
||||||
slog.Warn("kmsg watcher stopped", "err", err)
|
_ = f.Close()
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,7 +137,7 @@ func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
|||||||
if window == nil || len(window.events) == 0 {
|
if window == nil || len(window.events) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
go w.flushWindow(window)
|
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||||
@@ -229,7 +232,8 @@ func truncate(s string, max int) string {
|
|||||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||||
func isSATTarget(target string) bool {
|
func isSATTarget(target string) bool {
|
||||||
switch target {
|
switch target {
|
||||||
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
|
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
|
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||||
"platform-stress":
|
"platform-stress":
|
||||||
return true
|
return true
|
||||||
|
|||||||
137
audit/internal/webui/layout.go
Normal file
137
audit/internal/webui/layout.go
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func layoutHead(title string) string {
|
||||||
|
return `<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
<title>` + html.EscapeString(title) + `</title>
|
||||||
|
<style>
|
||||||
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||||
|
*{box-sizing:border-box;margin:0;padding:0}
|
||||||
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||||
|
a{color:var(--accent);text-decoration:none}
|
||||||
|
/* Sidebar */
|
||||||
|
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
|
||||||
|
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||||
|
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||||
|
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||||
|
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
|
||||||
|
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
|
||||||
|
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
|
||||||
|
.nav{flex:1}
|
||||||
|
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||||
|
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||||
|
.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
|
||||||
|
/* Content */
|
||||||
|
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
|
||||||
|
.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
|
||||||
|
.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
|
||||||
|
.content{padding:24px;flex:1}
|
||||||
|
/* Cards */
|
||||||
|
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
|
||||||
|
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
|
||||||
|
.card-head-actions{justify-content:space-between}
|
||||||
|
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
|
||||||
|
.card-body{padding:16px}
|
||||||
|
/* Buttons */
|
||||||
|
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
|
||||||
|
.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
|
||||||
|
.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
|
||||||
|
.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
|
||||||
|
.btn-sm{padding:5px 10px;font-size:12px}
|
||||||
|
/* Tables */
|
||||||
|
table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
|
||||||
|
th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
|
||||||
|
td{padding:9px 14px;border-top:1px solid var(--border-lite)}
|
||||||
|
tr:first-child td{border-top:0}
|
||||||
|
tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||||
|
/* Status badges */
|
||||||
|
.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
|
||||||
|
.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Component chips — one small square per device */
|
||||||
|
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
|
||||||
|
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
|
||||||
|
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Output terminal */
|
||||||
|
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||||
|
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||||
|
/* Forms */
|
||||||
|
.form-row{margin-bottom:14px}
|
||||||
|
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
|
||||||
|
.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
|
||||||
|
.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
|
||||||
|
/* Grid */
|
||||||
|
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
|
||||||
|
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
|
||||||
|
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
|
||||||
|
/* iframe viewer */
|
||||||
|
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
|
||||||
|
/* Alerts */
|
||||||
|
.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
|
||||||
|
.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
|
||||||
|
.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
`
|
||||||
|
}
|
||||||
|
|
||||||
|
func layoutNav(active string, buildLabel string) string {
|
||||||
|
items := []struct{ id, label, href, onclick string }{
|
||||||
|
{"dashboard", "Dashboard", "/", ""},
|
||||||
|
{"audit", "Audit", "/audit", ""},
|
||||||
|
{"validate", "Validate", "/validate", ""},
|
||||||
|
{"burn", "Burn", "/burn", ""},
|
||||||
|
{"benchmark", "Benchmark", "/benchmark", ""},
|
||||||
|
{"tasks", "Tasks", "/tasks", ""},
|
||||||
|
{"tools", "Tools", "/tools", ""},
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<aside class="sidebar">`)
|
||||||
|
b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
|
||||||
|
if strings.TrimSpace(buildLabel) == "" {
|
||||||
|
buildLabel = "dev"
|
||||||
|
}
|
||||||
|
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
gspMode := strings.TrimSpace(string(raw))
|
||||||
|
switch gspMode {
|
||||||
|
case "gsp-off":
|
||||||
|
b.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
|
||||||
|
case "gsp-stuck":
|
||||||
|
b.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`<nav class="nav">`)
|
||||||
|
for _, item := range items {
|
||||||
|
cls := "nav-item"
|
||||||
|
if item.id == active {
|
||||||
|
cls += " active"
|
||||||
|
}
|
||||||
|
if item.onclick != "" {
|
||||||
|
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" onclick="%s">%s</a>`,
|
||||||
|
cls, item.href, item.onclick, item.label))
|
||||||
|
} else {
|
||||||
|
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||||
|
cls, item.href, item.label))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`</nav>`)
|
||||||
|
b.WriteString(`</aside>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
@@ -8,6 +8,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -21,6 +22,13 @@ type MetricsDB struct {
|
|||||||
db *sql.DB
|
db *sql.DB
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *MetricsDB) Close() error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return m.db.Close()
|
||||||
|
}
|
||||||
|
|
||||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
@@ -45,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
|||||||
cpu_load_pct REAL,
|
cpu_load_pct REAL,
|
||||||
mem_load_pct REAL,
|
mem_load_pct REAL,
|
||||||
power_w REAL,
|
power_w REAL,
|
||||||
|
power_source TEXT,
|
||||||
|
power_mode TEXT,
|
||||||
|
power_reason TEXT,
|
||||||
PRIMARY KEY (ts)
|
PRIMARY KEY (ts)
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||||
@@ -54,6 +65,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
|
|||||||
usage_pct REAL,
|
usage_pct REAL,
|
||||||
mem_usage_pct REAL,
|
mem_usage_pct REAL,
|
||||||
power_w REAL,
|
power_w REAL,
|
||||||
|
clock_mhz REAL,
|
||||||
|
mem_clock_mhz REAL,
|
||||||
PRIMARY KEY (ts, gpu_index)
|
PRIMARY KEY (ts, gpu_index)
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS fan_metrics (
|
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||||
@@ -70,6 +83,47 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
|||||||
PRIMARY KEY (ts, name)
|
PRIMARY KEY (ts, name)
|
||||||
);
|
);
|
||||||
`)
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureMetricsColumn adds column (with the given SQL type definition) to
// table unless a column of that name already exists, migrating databases
// created before the column was introduced. Column-name comparison is
// case-insensitive to match SQLite's identifier semantics. table and column
// come from compile-time constants, so concatenating them into the PRAGMA
// and DDL statements (which cannot take bind parameters) is safe here.
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
	info, err := db.Query("PRAGMA table_info(" + table + ")")
	if err != nil {
		return err
	}
	defer info.Close()

	for info.Next() {
		// PRAGMA table_info yields: cid, name, type, notnull, dflt_value, pk.
		var (
			id, notNull, isPK int
			colName, colType  string
			defaultVal        sql.NullString
		)
		if err := info.Scan(&id, &colName, &colType, &notNull, &defaultVal, &isPK); err != nil {
			return err
		}
		if strings.EqualFold(colName, column) {
			// Column already present — nothing to migrate.
			return nil
		}
	}
	if err := info.Err(); err != nil {
		return err
	}
	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
	return err
}
|
||||||
|
|
||||||
@@ -83,16 +137,16 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
defer func() { _ = tx.Rollback() }()
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
_, err = tx.Exec(
|
_, err = tx.Exec(
|
||||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for _, g := range s.GPUs {
|
for _, g := range s.GPUs {
|
||||||
_, err = tx.Exec(
|
_, err = tx.Exec(
|
||||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
|
||||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -119,14 +173,81 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
return tx.Commit()
|
return tx.Commit()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||||
|
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||||
|
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||||
|
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||||
|
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||||
|
// the overall shape of every chart.
|
||||||
|
//
|
||||||
|
// Called hourly by the metrics collector background goroutine.
|
||||||
|
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
start := deleteOlderThan.Unix()
|
||||||
|
end := downsampleBefore.Unix()
|
||||||
|
if end <= start {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||||
|
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
_, err := m.db.Exec(`
|
||||||
|
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||||
|
AND ts NOT IN (
|
||||||
|
SELECT MIN(ts) FROM `+table+`
|
||||||
|
WHERE ts >= ? AND ts < ?
|
||||||
|
GROUP BY ts / 60
|
||||||
|
)`, start, end, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||||
|
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||||
|
func (m *MetricsDB) Prune(before time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cutTS := before.Unix()
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadBetween returns samples in chronological order within the given time window.
|
||||||
|
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||||
|
if m == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
start, end = end, start
|
||||||
|
}
|
||||||
|
return m.loadSamples(
|
||||||
|
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||||
|
start.Unix(), end.Unix(),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||||
@@ -140,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
type sysRow struct {
|
type sysRow struct {
|
||||||
ts int64
|
ts int64
|
||||||
cpu, mem, pwr float64
|
cpu, mem, pwr float64
|
||||||
|
powerSource string
|
||||||
|
powerMode string
|
||||||
|
powerReason string
|
||||||
}
|
}
|
||||||
var sysRows []sysRow
|
var sysRows []sysRow
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var r sysRow
|
var r sysRow
|
||||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
sysRows = append(sysRows, r)
|
sysRows = append(sysRows, r)
|
||||||
@@ -163,7 +287,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
}
|
}
|
||||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||||
gRows, err := m.db.Query(
|
gRows, err := m.db.Query(
|
||||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||||
minTS, maxTS,
|
minTS, maxTS,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -171,7 +295,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
for gRows.Next() {
|
for gRows.Next() {
|
||||||
var ts int64
|
var ts int64
|
||||||
var g platform.GPUMetricRow
|
var g platform.GPUMetricRow
|
||||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
|
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
|
||||||
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -254,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||||
for i, r := range sysRows {
|
for i, r := range sysRows {
|
||||||
s := platform.LiveMetricSample{
|
s := platform.LiveMetricSample{
|
||||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||||
CPULoadPct: r.cpu,
|
CPULoadPct: r.cpu,
|
||||||
MemLoadPct: r.mem,
|
MemLoadPct: r.mem,
|
||||||
PowerW: r.pwr,
|
PowerW: r.pwr,
|
||||||
|
PowerSource: r.powerSource,
|
||||||
|
PowerMode: r.powerMode,
|
||||||
|
PowerReason: r.powerReason,
|
||||||
}
|
}
|
||||||
for _, idx := range gpuIndices {
|
for _, idx := range gpuIndices {
|
||||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||||
@@ -283,7 +410,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||||
rows, err := m.db.Query(`
|
rows, err := m.db.Query(`
|
||||||
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
|
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
|
||||||
|
g.clock_mhz, g.mem_clock_mhz
|
||||||
FROM sys_metrics s
|
FROM sys_metrics s
|
||||||
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||||
ORDER BY s.ts, g.gpu_index
|
ORDER BY s.ts, g.gpu_index
|
||||||
@@ -294,13 +422,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
defer rows.Close()
|
defer rows.Close()
|
||||||
|
|
||||||
cw := csv.NewWriter(w)
|
cw := csv.NewWriter(w)
|
||||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
|
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var ts int64
|
var ts int64
|
||||||
var cpu, mem, pwr float64
|
var cpu, mem, pwr float64
|
||||||
var gpuIdx sql.NullInt64
|
var gpuIdx sql.NullInt64
|
||||||
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
|
var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
|
||||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
|
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
row := []string{
|
row := []string{
|
||||||
@@ -316,9 +444,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||||
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||||
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
row = append(row, "", "", "", "", "")
|
row = append(row, "", "", "", "", "", "", "")
|
||||||
}
|
}
|
||||||
_ = cw.Write(row)
|
_ = cw.Write(row)
|
||||||
}
|
}
|
||||||
@@ -326,9 +456,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
return cw.Error()
|
return cw.Error()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close closes the database.
|
|
||||||
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
|
||||||
|
|
||||||
// nullFloat wraps v in a sql.NullFloat64 that is always marked valid, for
// inserting into nullable REAL columns.
func nullFloat(v float64) sql.NullFloat64 {
	out := sql.NullFloat64{Valid: true}
	out.Float64 = v
	return out
}
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"database/sql"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
|
_ "modernc.org/sqlite"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||||
@@ -67,3 +69,106 @@ func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "metrics.db")
|
||||||
|
raw, err := sql.Open("sqlite", path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("sql.Open: %v", err)
|
||||||
|
}
|
||||||
|
_, err = raw.Exec(`
|
||||||
|
CREATE TABLE gpu_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
gpu_index INTEGER NOT NULL,
|
||||||
|
temp_c REAL,
|
||||||
|
usage_pct REAL,
|
||||||
|
mem_usage_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts, gpu_index)
|
||||||
|
);
|
||||||
|
CREATE TABLE sys_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
cpu_load_pct REAL,
|
||||||
|
mem_load_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts)
|
||||||
|
);
|
||||||
|
CREATE TABLE fan_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
rpm REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
CREATE TABLE temp_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
grp TEXT NOT NULL,
|
||||||
|
celsius REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("create legacy schema: %v", err)
|
||||||
|
}
|
||||||
|
_ = raw.Close()
|
||||||
|
|
||||||
|
db, err := openMetricsDB(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
now := time.Unix(1_700_000_100, 0).UTC()
|
||||||
|
err = db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: now,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
samples, err := db.LoadAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadAll: %v", err)
|
||||||
|
}
|
||||||
|
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
|
||||||
|
t.Fatalf("samples=%+v", samples)
|
||||||
|
}
|
||||||
|
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
|
||||||
|
t.Fatalf("ClockMHz=%v want 1410", got)
|
||||||
|
}
|
||||||
|
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
|
||||||
|
t.Fatalf("MemClockMHz=%v want 2600", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
|
||||||
|
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
base := time.Unix(1_700_000_000, 0).UTC()
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
if err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base.Add(time.Duration(i) * time.Minute),
|
||||||
|
CPULoadPct: float64(i),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("Write(%d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadBetween: %v", err)
|
||||||
|
}
|
||||||
|
if len(got) != 3 {
|
||||||
|
t.Fatalf("LoadBetween len=%d want 3", len(got))
|
||||||
|
}
|
||||||
|
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
|
||||||
|
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
613
audit/internal/webui/page_benchmark.go
Normal file
613
audit/internal/webui/page_benchmark.go
Normal file
@@ -0,0 +1,613 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// benchmarkHistoryRun is one prior benchmark run as summarized for the
// benchmark page (appears to back the results/history card — confirm
// against renderBenchmarkResultsCard).
type benchmarkHistoryRun struct {
	generatedAt   time.Time       // when the run's report was generated
	displayTime   string          // generatedAt preformatted for display
	gpuScores     map[int]float64 // per-GPU score — presumably keyed by GPU index; verify against producer
	gpuStatuses   map[int]string  // per-GPU status string — same presumed key
	overallStatus string          // aggregate status across the whole run
}
|
||||||
|
|
||||||
|
func renderBenchmark(opts HandlerOptions) string {
|
||||||
|
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="grid2">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Benchmark Setup</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div class="form-row">
|
||||||
|
<label>Profile</label>
|
||||||
|
<select id="benchmark-profile">
|
||||||
|
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||||
|
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||||
|
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="form-row">
|
||||||
|
<label>GPU Selection</label>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<label class="benchmark-cb-row">
|
||||||
|
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Sequential — one GPU at a time</span>
|
||||||
|
</label>
|
||||||
|
<label class="benchmark-cb-row" id="benchmark-parallel-label">
|
||||||
|
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Parallel — all selected GPUs simultaneously</span>
|
||||||
|
</label>
|
||||||
|
<label class="benchmark-cb-row" id="benchmark-ramp-label">
|
||||||
|
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||||||
|
</label>
|
||||||
|
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||||||
|
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||||||
|
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||||||
|
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
|
||||||
|
</div>
|
||||||
|
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||||
|
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||||
|
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||||||
|
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Method Split</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||||
|
<table>
|
||||||
|
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||||
|
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||||
|
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||||
|
</table>
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||||
|
|
||||||
|
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||||
|
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||||||
|
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let benchmarkES = null;
|
||||||
|
function benchmarkTaskIDs(payload) {
|
||||||
|
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||||
|
if (payload && payload.task_id) return [payload.task_id];
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
function benchmarkSelectedGPUIndices() {
|
||||||
|
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||||
|
.filter(function(el) { return el.checked && !el.disabled; })
|
||||||
|
.map(function(el) { return parseInt(el.value, 10); })
|
||||||
|
.filter(function(v) { return !Number.isNaN(v); })
|
||||||
|
.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
function benchmarkMode() {
|
||||||
|
const el = document.querySelector('input[name="benchmark-mode"]:checked');
|
||||||
|
return el ? el.value : 'sequential';
|
||||||
|
}
|
||||||
|
function benchmarkUpdateSelectionNote() {
|
||||||
|
const selected = benchmarkSelectedGPUIndices();
|
||||||
|
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||||||
|
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||||||
|
const note = document.getElementById('benchmark-selection-note');
|
||||||
|
if (!selected.length) {
|
||||||
|
perfBtn.disabled = true;
|
||||||
|
fitBtn.disabled = true;
|
||||||
|
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
perfBtn.disabled = false;
|
||||||
|
fitBtn.disabled = false;
|
||||||
|
const mode = benchmarkMode();
|
||||||
|
if (mode === 'ramp-up') {
|
||||||
|
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
|
||||||
|
} else if (mode === 'parallel') {
|
||||||
|
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||||||
|
} else {
|
||||||
|
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
function benchmarkRenderGPUList(gpus) {
|
||||||
|
const root = document.getElementById('benchmark-gpu-list');
|
||||||
|
if (!gpus || !gpus.length) {
|
||||||
|
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
root.innerHTML = gpus.map(function(gpu) {
|
||||||
|
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||||
|
return '<label class="benchmark-gpu-row">'
|
||||||
|
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||||||
|
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||||
|
+ '</label>';
|
||||||
|
}).join('');
|
||||||
|
benchmarkApplyMultiGPUState(gpus.length);
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function benchmarkApplyMultiGPUState(gpuCount) {
|
||||||
|
var multiValues = ['parallel', 'ramp-up'];
|
||||||
|
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
|
||||||
|
radios.forEach(function(el) {
|
||||||
|
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||||
|
if (gpuCount < 2 && isMulti) {
|
||||||
|
el.disabled = true;
|
||||||
|
if (el.checked) {
|
||||||
|
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
|
||||||
|
if (seq) seq.checked = true;
|
||||||
|
}
|
||||||
|
var label = el.closest('label');
|
||||||
|
if (label) label.style.opacity = '0.4';
|
||||||
|
} else {
|
||||||
|
el.disabled = false;
|
||||||
|
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
|
||||||
|
var label = el.closest('label');
|
||||||
|
if (label) label.style.opacity = '';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function benchmarkLoadGPUs() {
|
||||||
|
const status = document.getElementById('benchmark-run-status');
|
||||||
|
status.textContent = '';
|
||||||
|
fetch('/api/gpu/nvidia').then(function(r) {
|
||||||
|
return r.json().then(function(body) {
|
||||||
|
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||||
|
return body;
|
||||||
|
});
|
||||||
|
}).then(function(gpus) {
|
||||||
|
benchmarkRenderGPUList(gpus);
|
||||||
|
}).catch(function(err) {
|
||||||
|
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function benchmarkSelectAll() {
|
||||||
|
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
// Uncheck every benchmark GPU checkbox and refresh the selection summary.
function benchmarkSelectNone() {
  const boxes = document.querySelectorAll('.benchmark-gpu-checkbox');
  boxes.forEach(function(box) { box.checked = false; });
  benchmarkUpdateSelectionNote();
}
|
||||||
|
// Enqueue an NVIDIA benchmark run ('performance' or 'power-fit') for the
// currently selected GPUs and stream each queued task's log into the
// on-page terminal via Server-Sent Events, one task at a time.
function runNvidiaBenchmark(kind) {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
if (!selected.length) {
status.textContent = 'Select at least one GPU.';
return;
}
// Close any stream left over from a previous run before starting a new one.
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const mode = benchmarkMode();
// Ramp-up only makes sense with more than one GPU selected.
const rampUp = mode === 'ramp-up' && selected.length > 1;
// Parallel execution is only offered for the performance benchmark.
const parallelGPUs = mode === 'parallel' && kind === 'performance';
if (kind === 'power-fit' && mode === 'parallel') {
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
return;
}
// Request payload; display_name is echoed back into the page header.
const body = {
profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected,
run_nccl: kind === 'performance' && selected.length > 1,
parallel_gpus: parallelGPUs,
ramp_up: rampUp,
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
};
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
const term = document.getElementById('benchmark-terminal');
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
status.textContent = 'Queueing...';
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
fetch(endpoint, {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(body)
}).then(function(r) {
// Parse the body first so server-supplied error messages are surfaced.
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No benchmark task was queued.');
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
// Recursively stream each task's log in order; `failures` accumulates
// the count of tasks that ended with an error payload or a dropped stream.
const streamNext = function(idx, failures) {
if (idx >= taskIds.length) {
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
return;
}
const taskId = taskIds[idx];
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
// Default messages are raw log lines; keep the terminal scrolled to the bottom.
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
// 'done' carries an error string in e.data on failure, empty on success.
benchmarkES.addEventListener('done', function(e) {
benchmarkES.close();
benchmarkES = null;
if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
const isLast = (idx + 1 >= taskIds.length);
streamNext(idx + 1, failures);
// Refresh the saved-results card only once, after the final task.
if (isLast) { benchmarkRefreshResults(); }
});
// A transport error counts as one failure and moves on to the next task.
benchmarkES.onerror = function() {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
streamNext(idx + 1, failures + 1);
};
};
streamNext(0, 0);
}).catch(function(err) {
status.textContent = 'Error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
|
||||||
|
// Render the autotune status summary line into #benchmark-autotune-status.
// Shows a "not configured" fallback message when the payload has no config.
function benchmarkRenderAutotuneStatus(payload) {
  const el = document.getElementById('benchmark-autotune-status');
  if (!el) return;
  const configured = payload && payload.configured && payload.config;
  if (!configured) {
    el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
    return;
  }
  const cfg = payload.config || {};
  const decision = payload.decision || {};
  let updated = 'unknown time';
  if (cfg.updated_at) updated = new Date(cfg.updated_at).toLocaleString();
  let confidence = '';
  if (typeof cfg.confidence === 'number') confidence = ' · confidence ' + Math.round(cfg.confidence * 100) + '%';
  let effective = '';
  if (decision.effective_source) effective = ' · effective ' + decision.effective_source;
  let mode = '';
  if (decision.mode) mode = ' · mode ' + decision.mode;
  el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
}
|
||||||
|
// Fetch the autotune status from the API and render it; on failure the
// status element shows the error message inline.
function loadBenchmarkAutotuneStatus() {
  fetch('/api/bee-bench/nvidia/autotune/status')
    .then(function(r) {
      return r.json().then(function(body) {
        if (r.ok) return body;
        throw new Error(body.error || ('HTTP ' + r.status));
      });
    })
    .then(function(body) {
      benchmarkRenderAutotuneStatus(body);
    })
    .catch(function(err) {
      const el = document.getElementById('benchmark-autotune-status');
      if (el) el.textContent = 'Autotune status error: ' + err.message;
    });
}
|
||||||
|
// Enqueue a benchmark autotune run for the selected GPUs and stream its
// task log into the on-page terminal via Server-Sent Events. On completion
// the autotune status line is refreshed.
function runBenchmarkAutotune() {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
const term = document.getElementById('benchmark-terminal');
// Close any stream left over from a previous run before starting a new one.
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
term.textContent = 'Enqueuing benchmark autotune...\n';
status.textContent = 'Queueing autotune...';
fetch('/api/bee-bench/nvidia/autotune/run', {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify({
profile: document.getElementById('benchmark-profile').value || 'standard',
// Parallel mode autotunes against the performance benchmark;
// sequential/ramp-up autotune against the power fit.
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
gpu_indices: selected
})
}).then(function(r) {
// Parse the body first so server-supplied error messages are surfaced.
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No autotune task was queued.');
// Autotune enqueues a single task; only the first ID is streamed.
const taskId = taskIds[0];
status.textContent = 'Autotune queued: ' + taskId;
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
// 'done' carries an error string in e.data on failure, empty on success.
benchmarkES.addEventListener('done', function(e) {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
loadBenchmarkAutotuneStatus();
});
}).catch(function(err) {
status.textContent = 'Autotune error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
|
||||||
|
// Initial page load: populate the GPU picker and the autotune status line.
benchmarkLoadGPUs();
loadBenchmarkAutotuneStatus();
|
||||||
|
// Reload the saved-results section HTML after a run completes.
// Failures are deliberately ignored: this is a best-effort refresh.
function benchmarkRefreshResults() {
  fetch('/api/benchmark/results')
    .then(function(r) { return r.text(); })
    .then(function(html) {
      const section = document.getElementById('benchmark-results-section');
      if (!section) return;
      section.innerHTML = html;
    })
    .catch(function() {});
}
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
|
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||||
|
perf := renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Perf Results",
|
||||||
|
"Composite score by saved benchmark run and GPU.",
|
||||||
|
"No saved performance benchmark runs yet.",
|
||||||
|
maxIdx,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||||
|
return perf + "\n" + power
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderBenchmarkResultsCardFromRuns renders a results card as an HTML table
// with one row per saved run and one score column per GPU index from 0 to
// maxGPUIndex. When runs is empty, a card containing emptyMessage is
// returned instead. All caller-supplied text is HTML-escaped.
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
if len(runs) == 0 {
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
}
var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
if strings.TrimSpace(description) != "" {
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
}
b.WriteString(`<div style="overflow-x:auto">`)
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
// One column per possible GPU index across all runs.
for i := 0; i <= maxGPUIndex; i++ {
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
}
b.WriteString(`</tr></thead><tbody>`)
for i, run := range runs {
b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
// Empty overall status is treated as OK; FAILED is critical, anything
// else non-OK is rendered as a warning.
overallColor := "var(--ok)"
overallLabel := run.overallStatus
if overallLabel == "" {
overallLabel = "OK"
}
if overallLabel == "FAILED" {
overallColor = "var(--crit-fg,#9f3a38)"
} else if overallLabel != "OK" {
overallColor = "var(--warn)"
}
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
for idx := 0; idx <= maxGPUIndex; idx++ {
score, ok := run.gpuScores[idx]
if !ok {
// This run has no score for this GPU index.
b.WriteString(`<td style="color:var(--muted)">-</td>`)
continue
}
gpuStatus := run.gpuStatuses[idx]
// Per-GPU score styling mirrors the overall-status coloring; unknown
// statuses fall through to the warning style.
scoreColor := ""
switch gpuStatus {
case "FAILED":
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
case "WARNING", "PARTIAL":
scoreColor = ` style="color:var(--warn);font-weight:600"`
case "", "OK":
default:
scoreColor = ` style="color:var(--warn);font-weight:600"`
}
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
}
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div></div></div>`)
return b.String()
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||||
|
baseDir := app.DefaultBeeBenchPerfDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return -1, nil
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
return loadBenchmarkHistoryFromPaths(paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||||||
|
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||||
|
maxGPUIndex := -1
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var result platform.NvidiaBenchmarkResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
run := benchmarkHistoryRun{
|
||||||
|
generatedAt: result.GeneratedAt,
|
||||||
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
gpuScores: make(map[int]float64),
|
||||||
|
gpuStatuses: make(map[int]string),
|
||||||
|
overallStatus: result.OverallStatus,
|
||||||
|
}
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||||
|
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||||
|
if gpu.Index > maxGPUIndex {
|
||||||
|
maxGPUIndex = gpu.Index
|
||||||
|
}
|
||||||
|
}
|
||||||
|
runs = append(runs, run)
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
return maxGPUIndex, runs
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||||
|
baseDir := app.DefaultBeeBenchPowerDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
|
||||||
|
type powerRun struct {
|
||||||
|
generatedAt time.Time
|
||||||
|
displayTime string
|
||||||
|
result platform.NvidiaPowerBenchResult
|
||||||
|
}
|
||||||
|
var runs []powerRun
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var r platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &r); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
runs = append(runs, powerRun{
|
||||||
|
generatedAt: r.GeneratedAt,
|
||||||
|
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
result: r,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||||
|
|
||||||
|
latest := runs[0].result
|
||||||
|
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||||
|
if latest.Hostname != "" {
|
||||||
|
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||||
|
}
|
||||||
|
if latest.OverallStatus != "" {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if latest.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</p>`)
|
||||||
|
|
||||||
|
if len(latest.GPUs) > 0 {
|
||||||
|
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||||
|
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||||
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
|
for _, gpu := range latest.GPUs {
|
||||||
|
finalLimitW := gpu.StablePowerLimitW
|
||||||
|
if finalLimitW <= 0 {
|
||||||
|
finalLimitW = gpu.AppliedPowerLimitW
|
||||||
|
}
|
||||||
|
derated := gpu.Derated ||
|
||||||
|
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||||
|
rowStyle := ""
|
||||||
|
finalStyle := ""
|
||||||
|
if derated {
|
||||||
|
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||||
|
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||||
|
}
|
||||||
|
statusLabel := gpu.Status
|
||||||
|
if statusLabel == "" {
|
||||||
|
statusLabel = "OK"
|
||||||
|
}
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if statusLabel == "FAILED" {
|
||||||
|
statusColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if statusLabel != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
nominalStr := "-"
|
||||||
|
if gpu.DefaultPowerLimitW > 0 {
|
||||||
|
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
singleStr := "-"
|
||||||
|
if gpu.AppliedPowerLimitW > 0 {
|
||||||
|
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
multiStr := "-"
|
||||||
|
if gpu.StablePowerLimitW > 0 {
|
||||||
|
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||||
|
}
|
||||||
|
p95Str := "-"
|
||||||
|
if gpu.MaxObservedPowerW > 0 {
|
||||||
|
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr` + rowStyle + `>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||||
|
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(runs) > 1 {
|
||||||
|
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||||
|
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||||
|
for i, run := range runs {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if run.result.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr>`)
|
||||||
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div></details>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
383
audit/internal/webui/page_burn.go
Normal file
383
audit/internal/webui/page_burn.go
Normal file
@@ -0,0 +1,383 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
func renderBurn() string {
|
||||||
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||||
|
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||||
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Burn Profile</div>
|
||||||
|
<div class="card-body burn-profile-body">
|
||||||
|
<div class="burn-profile-col">
|
||||||
|
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
|
||||||
|
</div>
|
||||||
|
<div class="burn-profile-col burn-profile-action">
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||||
|
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
|
||||||
|
</div>
|
||||||
|
<div class="burn-profile-col burn-profile-action">
|
||||||
|
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||||
|
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
||||||
|
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||||
|
<div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
|
||||||
|
<label class="cb-row">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="sequential" checked>
|
||||||
|
<span>Sequential — selected GPUs one at a time</span>
|
||||||
|
</label>
|
||||||
|
<label class="cb-row" id="burn-parallel-label">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="parallel">
|
||||||
|
<span>Parallel — all selected GPUs simultaneously</span>
|
||||||
|
</label>
|
||||||
|
<label class="cb-row" id="burn-ramp-label">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="ramp-up">
|
||||||
|
<span>Ramp-up — add one GPU at a time</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="burn-section">Core Burn Paths</div>
|
||||||
|
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||||
|
<div class="card burn-card">
|
||||||
|
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
|
||||||
|
<div class="card-body burn-card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card burn-card">
|
||||||
|
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
|
||||||
|
<div class="card-body burn-card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||||
|
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
|
||||||
|
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
|
||||||
|
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
|
||||||
|
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||||
|
.burn-profile-col { min-width:0; }
|
||||||
|
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
|
||||||
|
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
|
||||||
|
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
|
||||||
|
.burn-grid { align-items:stretch; }
|
||||||
|
.burn-card { height:100%; display:flex; flex-direction:column; }
|
||||||
|
.burn-card-body { flex:1; display:flex; flex-direction:column; }
|
||||||
|
.card-head-actions { justify-content:space-between; }
|
||||||
|
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
|
||||||
|
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let biES = null;
|
||||||
|
// Normalize an enqueue-API response into an array of task IDs.
// Accepts {task_ids: [...]} (non-empty) or {task_id: ...}; anything else
// yields an empty array.
function burnTaskIDs(payload) {
  if (!payload) return [];
  const ids = payload.task_ids;
  if (Array.isArray(ids) && ids.length > 0) return ids;
  return payload.task_id ? [payload.task_id] : [];
}
|
||||||
|
// Currently selected burn profile preset; defaults to 'smoke' when no
// radio button is checked.
function burnProfile() {
  const checked = document.querySelector('input[name="burn-profile"]:checked');
  if (checked) return checked.value;
  return 'smoke';
}
|
||||||
|
// Indices of the checked, enabled NVIDIA GPU checkboxes, sorted ascending.
// Non-numeric checkbox values are dropped.
function burnSelectedGPUIndices() {
  const boxes = Array.from(document.querySelectorAll('.burn-gpu-checkbox'));
  const indices = [];
  boxes.forEach(function(box) {
    if (!box.checked || box.disabled) return;
    const idx = parseInt(box.value, 10);
    if (!Number.isNaN(idx)) indices.push(idx);
  });
  indices.sort(function(a, b) { return a - b; });
  return indices;
}
|
||||||
|
// Currently selected NVIDIA burn execution mode; defaults to 'sequential'.
function burnNvidiaMode() {
  const checked = document.querySelector('input[name="burn-nvidia-mode"]:checked');
  if (checked) return checked.value;
  return 'sequential';
}
|
||||||
|
// Enable or disable the multi-GPU burn modes based on detected GPU count.
// With fewer than 2 GPUs, 'parallel' and 'ramp-up' are disabled and dimmed;
// if one of them was checked, selection falls back to 'sequential'.
function burnApplyMultiGPUState(gpuCount) {
  var multiValues = ['parallel', 'ramp-up'];
  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
  radios.forEach(function(el) {
    var isMulti = multiValues.indexOf(el.value) >= 0;
    // Hoisted: the original declared `var label` separately in both
    // branches, redeclaring the same function-scoped variable.
    var label = el.closest('label');
    if (gpuCount < 2 && isMulti) {
      el.disabled = true;
      if (el.checked) {
        // The checked mode is no longer available; fall back to sequential.
        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
        if (seq) seq.checked = true;
      }
      if (label) label.style.opacity = '0.4';
    } else {
      el.disabled = false;
      if (label) label.style.opacity = '';
    }
  });
}
|
||||||
|
// Refresh the helper text under the burn GPU list to reflect the current
// selection (or prompt the user to select at least one GPU).
function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
  if (selected.length === 0) {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
  } else {
    note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
  }
}
|
||||||
|
// Render one pre-checked checkbox row per detected GPU into #burn-gpu-list,
// then sync the mode radios and selection note. Shows a placeholder when
// no GPUs were detected.
function burnRenderGPUList(gpus) {
  const root = document.getElementById('burn-gpu-list');
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    burnUpdateSelectionNote();
    return;
  }
  const rows = gpus.map(function(gpu) {
    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
    const parts = [
      '<label class="burn-gpu-row">',
      '<input class="burn-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="burnUpdateSelectionNote()">',
      '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>',
      '</label>'
    ];
    return parts.join('');
  });
  root.innerHTML = rows.join('');
  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
}
|
||||||
|
function burnSelectAll() {
|
||||||
|
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function burnSelectNone() {
|
||||||
|
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function burnLoadGPUs() {
|
||||||
|
fetch('/api/gpu/nvidia').then(function(r) {
|
||||||
|
return r.json().then(function(body) {
|
||||||
|
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||||
|
return body;
|
||||||
|
});
|
||||||
|
}).then(function(gpus) {
|
||||||
|
burnRenderGPUList(gpus);
|
||||||
|
}).catch(function(err) {
|
||||||
|
document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
||||||
|
const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
|
||||||
|
if (useSelectedNvidia) {
|
||||||
|
const selected = burnSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
|
}
|
||||||
|
body.gpu_indices = selected;
|
||||||
|
const bMode = burnNvidiaMode();
|
||||||
|
if (bMode === 'ramp-up' && selected.length > 1) {
|
||||||
|
body.stagger_gpu_start = true;
|
||||||
|
} else if (bMode === 'parallel' && selected.length > 1) {
|
||||||
|
body.parallel_gpus = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fetch('/api/sat/' + target + '/run', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify(body)
|
||||||
|
}).then(function(r) {
|
||||||
|
return r.json().then(function(payload) {
|
||||||
|
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||||
|
return payload;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function streamTask(taskId, label) {
|
||||||
|
if (biES) { biES.close(); biES = null; }
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
|
||||||
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function streamBurnTask(taskId, label, resetTerminal) {
|
||||||
|
return streamBurnTaskSet([taskId], label, resetTerminal);
|
||||||
|
}
|
||||||
|
function streamBurnTaskSet(taskIds, label, resetTerminal) {
|
||||||
|
if (biES) { biES.close(); biES = null; }
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
if (resetTerminal) {
|
||||||
|
term.textContent = '';
|
||||||
|
}
|
||||||
|
if (!Array.isArray(taskIds) || !taskIds.length) {
|
||||||
|
term.textContent += 'ERROR: no tasks queued.\n';
|
||||||
|
return Promise.resolve({ok:false, error:'no tasks queued'});
|
||||||
|
}
|
||||||
|
const streamNext = function(idx, failures) {
|
||||||
|
if (idx >= taskIds.length) {
|
||||||
|
return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
|
||||||
|
}
|
||||||
|
const taskId = taskIds[idx];
|
||||||
|
term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
|
||||||
|
return new Promise(function(resolve) {
|
||||||
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve(failures + (e.data ? 1 : 0));
|
||||||
|
});
|
||||||
|
biES.onerror = function() {
|
||||||
|
if (biES) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve(failures + 1);
|
||||||
|
};
|
||||||
|
}).then(function(nextFailures) {
|
||||||
|
return streamNext(idx + 1, nextFailures);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return streamNext(0, 0);
|
||||||
|
}
|
||||||
|
function runBurnTaskSet(tasks, statusElId) {
|
||||||
|
const enabled = tasks.filter(function(t) {
|
||||||
|
const el = document.getElementById(t.id);
|
||||||
|
return el && el.checked && !el.disabled;
|
||||||
|
});
|
||||||
|
const status = statusElId ? document.getElementById(statusElId) : null;
|
||||||
|
if (status) status.textContent = '';
|
||||||
|
if (!enabled.length) {
|
||||||
|
if (status) status.textContent = 'No tasks selected.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
|
||||||
|
term.textContent = '';
|
||||||
|
const runNext = function(idx) {
|
||||||
|
if (idx >= enabled.length) {
|
||||||
|
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
|
||||||
|
return Promise.resolve();
|
||||||
|
}
|
||||||
|
const t = enabled[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
|
||||||
|
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||||
|
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||||
|
.then(function(d) {
|
||||||
|
return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
|
||||||
|
})
|
||||||
|
.then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
|
})
|
||||||
|
.catch(function(err) {
|
||||||
|
if (status) status.textContent = 'Error: ' + err.message;
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||||
|
return Promise.reject(err);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}
|
||||||
|
function runPlatformStress() {
|
||||||
|
const comps = [];
|
||||||
|
const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
|
||||||
|
const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
|
||||||
|
const hasChecked = function(ids) {
|
||||||
|
return ids.some(function(id) {
|
||||||
|
const el = document.getElementById(id);
|
||||||
|
return el && el.checked && !el.disabled;
|
||||||
|
});
|
||||||
|
};
|
||||||
|
if (hasChecked(computeIDs)) comps.push('cpu');
|
||||||
|
if (hasChecked(gpuIDs)) comps.push('gpu');
|
||||||
|
if (!comps.length) {
|
||||||
|
const status = document.getElementById('burn-all-status');
|
||||||
|
if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const extra = comps.length > 0 ? {platform_components: comps} : {};
|
||||||
|
enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', extra, false).then(function(d) {
|
||||||
|
streamTask(d.task_id, 'Platform Thermal Cycling');
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runAllBurnTasks() {
|
||||||
|
const status = document.getElementById('burn-all-status');
|
||||||
|
const all = [
|
||||||
|
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
||||||
|
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
||||||
|
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
|
||||||
|
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
|
||||||
|
{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
|
||||||
|
{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
|
||||||
|
{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
|
||||||
|
];
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
runBurnTaskSet(all, 'burn-all-status');
|
||||||
|
}
|
||||||
|
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
||||||
|
const map = {
|
||||||
|
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
||||||
|
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
||||||
|
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
|
||||||
|
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
|
||||||
|
};
|
||||||
|
tools.forEach(function(t) {
|
||||||
|
const spec = map[t.id];
|
||||||
|
if (!spec) return;
|
||||||
|
const cb = document.getElementById(spec.cb);
|
||||||
|
const note = document.getElementById(spec.note);
|
||||||
|
if (!cb) return;
|
||||||
|
if (t.available) {
|
||||||
|
cb.disabled = false;
|
||||||
|
} else if (note) {
|
||||||
|
note.textContent = '— ' + spec.reason;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}).catch(function() {});
|
||||||
|
burnLoadGPUs();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
510
audit/internal/webui/page_export_tools.go
Normal file
510
audit/internal/webui/page_export_tools.go
Normal file
@@ -0,0 +1,510 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"net/url"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func renderExport(exportDir string) string {
|
||||||
|
entries, _ := listExportFiles(exportDir)
|
||||||
|
var rows strings.Builder
|
||||||
|
for _, e := range entries {
|
||||||
|
rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
|
||||||
|
url.QueryEscape(e), html.EscapeString(e)))
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
|
||||||
|
}
|
||||||
|
return `<div class="grid2">
|
||||||
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
|
` + renderSupportBundleInline() + `
|
||||||
|
</div></div>
|
||||||
|
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||||
|
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||||
|
</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
` + renderUSBExportCard()
|
||||||
|
}
|
||||||
|
|
||||||
|
func listExportFiles(exportDir string) ([]string, error) {
|
||||||
|
var entries []string
|
||||||
|
err := filepath.Walk(strings.TrimSpace(exportDir), func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(exportDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
entries = append(entries, rel)
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
sort.Strings(entries)
|
||||||
|
return entries, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderSupportBundleInline() string {
|
||||||
|
return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
|
||||||
|
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
|
||||||
|
<script>
|
||||||
|
window.supportBundleDownload = function() {
|
||||||
|
var btn = document.getElementById('support-bundle-btn');
|
||||||
|
var status = document.getElementById('support-bundle-status');
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Building...';
|
||||||
|
status.textContent = 'Collecting logs and export data\u2026';
|
||||||
|
status.style.color = 'var(--muted)';
|
||||||
|
var filename = 'bee-support.tar.gz';
|
||||||
|
fetch('/export/support.tar.gz')
|
||||||
|
.then(function(r) {
|
||||||
|
if (!r.ok) throw new Error('HTTP ' + r.status);
|
||||||
|
var cd = r.headers.get('Content-Disposition') || '';
|
||||||
|
var m = cd.match(/filename="?([^";]+)"?/);
|
||||||
|
if (m) filename = m[1];
|
||||||
|
return r.blob();
|
||||||
|
})
|
||||||
|
.then(function(blob) {
|
||||||
|
var url = URL.createObjectURL(blob);
|
||||||
|
var a = document.createElement('a');
|
||||||
|
a.href = url;
|
||||||
|
a.download = filename;
|
||||||
|
document.body.appendChild(a);
|
||||||
|
a.click();
|
||||||
|
document.body.removeChild(a);
|
||||||
|
URL.revokeObjectURL(url);
|
||||||
|
status.textContent = 'Download started.';
|
||||||
|
status.style.color = 'var(--ok-fg)';
|
||||||
|
})
|
||||||
|
.catch(function(e) {
|
||||||
|
status.textContent = 'Error: ' + e.message;
|
||||||
|
status.style.color = 'var(--crit-fg)';
|
||||||
|
})
|
||||||
|
.finally(function() {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = '\u2195 Download Support Bundle';
|
||||||
|
});
|
||||||
|
};
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderUSBExportCard() string {
|
||||||
|
return `<div class="card" style="margin-top:16px">
|
||||||
|
<div class="card-head">USB Black-Box
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||||
|
</div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderUSBExportInline() string {
|
||||||
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
|
||||||
|
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||||
|
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
|
||||||
|
<div id="usb-targets" style="margin-top:12px"></div>
|
||||||
|
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||||
|
<script>
|
||||||
|
(function(){
|
||||||
|
function blackboxRefresh() {
|
||||||
|
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||||
|
document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
|
||||||
|
document.getElementById('usb-targets').innerHTML = '';
|
||||||
|
document.getElementById('usb-msg').textContent = '';
|
||||||
|
Promise.all([
|
||||||
|
fetch('/api/export/usb').then(r=>r.json()),
|
||||||
|
fetch('/api/blackbox/status').then(r=>r.json())
|
||||||
|
]).then(function(values) {
|
||||||
|
const targets = Array.isArray(values[0]) ? values[0] : [];
|
||||||
|
const state = values[1] || {};
|
||||||
|
const active = Array.isArray(state.targets) ? state.targets : [];
|
||||||
|
window._usbTargets = targets;
|
||||||
|
window._blackboxTargets = active;
|
||||||
|
const st = document.getElementById('usb-status');
|
||||||
|
const ct = document.getElementById('usb-targets');
|
||||||
|
const summary = document.getElementById('blackbox-summary');
|
||||||
|
if (state.boot_folder) {
|
||||||
|
summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
|
||||||
|
} else {
|
||||||
|
summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
|
||||||
|
}
|
||||||
|
if (!targets || targets.length === 0) {
|
||||||
|
st.textContent = 'No removable USB devices found.';
|
||||||
|
} else {
|
||||||
|
st.textContent = targets.length + ' device(s) found:';
|
||||||
|
}
|
||||||
|
const byDevice = {};
|
||||||
|
active.forEach(function(item) { byDevice[item.device] = item; });
|
||||||
|
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
|
||||||
|
targets.map((t, idx) => {
|
||||||
|
const dev = t.device || '';
|
||||||
|
const label = t.label || '';
|
||||||
|
const model = t.model || '';
|
||||||
|
const state = byDevice[dev];
|
||||||
|
const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
|
||||||
|
const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+state.last_error+'</div>') : '';
|
||||||
|
return '<tr>' +
|
||||||
|
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||||
|
'<td>'+t.fs_type+'</td>' +
|
||||||
|
'<td>'+t.size+'</td>' +
|
||||||
|
'<td>'+label+'</td>' +
|
||||||
|
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||||
|
'<td style="font-size:12px">'+status+detail+'</td>' +
|
||||||
|
'<td style="white-space:nowrap">' +
|
||||||
|
(state
|
||||||
|
? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
|
||||||
|
: '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
|
||||||
|
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
||||||
|
'</td></tr>';
|
||||||
|
}).join('') + '</table>';
|
||||||
|
}).catch(e => {
|
||||||
|
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
window.blackboxEnable = function(targetIndex, btn) {
|
||||||
|
const target = (window._usbTargets || [])[targetIndex];
|
||||||
|
if (!target) {
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: USB target not found. Refresh and try again.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
const row = btn ? btn.closest('td') : null;
|
||||||
|
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||||
|
const originalText = btn ? btn.textContent : '';
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Enabling...';
|
||||||
|
}
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--muted)';
|
||||||
|
rowMsg.textContent = 'Working...';
|
||||||
|
}
|
||||||
|
msg.style.color = 'var(--muted)';
|
||||||
|
msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
|
||||||
|
fetch('/api/blackbox/enable', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify(target)
|
||||||
|
}).then(async r => {
|
||||||
|
const d = await r.json();
|
||||||
|
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||||
|
return d;
|
||||||
|
}).then(d => {
|
||||||
|
msg.style.color = 'var(--ok,green)';
|
||||||
|
msg.textContent = d.message || 'Done.';
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--ok,green)';
|
||||||
|
rowMsg.textContent = d.message || 'Done.';
|
||||||
|
}
|
||||||
|
}).catch(e => {
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: '+e;
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--err,red)';
|
||||||
|
rowMsg.textContent = 'Error: ' + e;
|
||||||
|
}
|
||||||
|
}).finally(() => {
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = originalText;
|
||||||
|
}
|
||||||
|
setTimeout(blackboxRefresh, 300);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
window.blackboxDisable = function(targetIndex, btn) {
|
||||||
|
const target = (window._usbTargets || [])[targetIndex];
|
||||||
|
const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
|
||||||
|
if (!target || !active) {
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: black-box target not found. Refresh and try again.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
const row = btn ? btn.closest('td') : null;
|
||||||
|
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||||
|
const originalText = btn ? btn.textContent : '';
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Disabling...';
|
||||||
|
}
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--muted)';
|
||||||
|
rowMsg.textContent = 'Working...';
|
||||||
|
}
|
||||||
|
msg.style.color = 'var(--muted)';
|
||||||
|
msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
|
||||||
|
fetch('/api/blackbox/disable', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
|
||||||
|
}).then(async r => {
|
||||||
|
const d = await r.json();
|
||||||
|
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||||
|
return d;
|
||||||
|
}).then(d => {
|
||||||
|
msg.style.color = 'var(--ok,green)';
|
||||||
|
msg.textContent = d.message || 'Done.';
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--ok,green)';
|
||||||
|
rowMsg.textContent = d.message || 'Done.';
|
||||||
|
}
|
||||||
|
}).catch(e => {
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: '+e;
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--err,red)';
|
||||||
|
rowMsg.textContent = 'Error: ' + e;
|
||||||
|
}
|
||||||
|
}).finally(() => {
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = originalText;
|
||||||
|
}
|
||||||
|
setTimeout(blackboxRefresh, 300);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
window.blackboxRefresh = blackboxRefresh;
|
||||||
|
blackboxRefresh();
|
||||||
|
})();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderNvidiaSelfHealInline() string {
|
||||||
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
|
||||||
|
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
|
||||||
|
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
|
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||||
|
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
|
||||||
|
</div>
|
||||||
|
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
function nvidiaSelfHealShowResult(label, status, output) {
|
||||||
|
var out = document.getElementById('nvidia-self-heal-out');
|
||||||
|
var term = document.getElementById('nvidia-self-heal-terminal');
|
||||||
|
var statusEl = document.getElementById('nvidia-self-heal-out-status');
|
||||||
|
var labelEl = document.getElementById('nvidia-self-heal-out-label');
|
||||||
|
out.style.display = 'block';
|
||||||
|
labelEl.textContent = label;
|
||||||
|
term.textContent = output || '(no output)';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
if (status === 'ok') {
|
||||||
|
statusEl.textContent = '✓ done';
|
||||||
|
statusEl.style.color = 'var(--ok-fg, #2c662d)';
|
||||||
|
} else {
|
||||||
|
statusEl.textContent = '✗ failed';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
function nvidiaRestartDrivers() {
|
||||||
|
var btn = document.getElementById('nvidia-restart-btn');
|
||||||
|
var original = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Restarting...';
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
|
||||||
|
fetch('/api/services/action', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body:JSON.stringify({name:'bee-nvidia', action:'restart'})
|
||||||
|
}).then(r=>r.json()).then(d => {
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
|
||||||
|
setTimeout(function() {
|
||||||
|
loadServices();
|
||||||
|
loadNvidiaSelfHeal();
|
||||||
|
}, 800);
|
||||||
|
}).catch(e => {
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
|
||||||
|
}).finally(() => {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = original;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function nvidiaResetGPU(index, btn) {
|
||||||
|
var original = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Resetting...';
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
|
||||||
|
fetch('/api/gpu/nvidia-reset', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body:JSON.stringify({index:index})
|
||||||
|
}).then(r=>r.json()).then(d => {
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || '(no output)');
|
||||||
|
setTimeout(loadNvidiaSelfHeal, 1000);
|
||||||
|
}).catch(e => {
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
|
||||||
|
}).finally(() => {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = original;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function loadNvidiaSelfHeal() {
|
||||||
|
var status = document.getElementById('nvidia-self-heal-status');
|
||||||
|
var table = document.getElementById('nvidia-self-heal-table');
|
||||||
|
status.textContent = 'Loading NVIDIA GPU status...';
|
||||||
|
status.style.color = 'var(--muted)';
|
||||||
|
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
|
||||||
|
fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
|
||||||
|
if (!Array.isArray(gpus) || gpus.length === 0) {
|
||||||
|
status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
|
||||||
|
table.innerHTML = '';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
|
||||||
|
const rows = gpus.map(g => {
|
||||||
|
const serial = g.serial || '';
|
||||||
|
const bdf = g.bdf || '';
|
||||||
|
const id = serial || bdf || ('gpu-' + g.index);
|
||||||
|
const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
|
||||||
|
const details = [];
|
||||||
|
if (serial) details.push('serial ' + serial);
|
||||||
|
if (bdf) details.push('bdf ' + bdf);
|
||||||
|
if (g.parse_failure && g.raw_line) details.push(g.raw_line);
|
||||||
|
return '<tr>'
|
||||||
|
+ '<td style="white-space:nowrap">' + g.index + '</td>'
|
||||||
|
+ '<td>' + (g.name || 'unknown') + '</td>'
|
||||||
|
+ '<td style="font-family:monospace">' + id + '</td>'
|
||||||
|
+ '<td><span class="badge ' + badge + '">' + (g.status || 'UNKNOWN') + '</span>'
|
||||||
|
+ (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.join(' | ') + '</div>' : '')
|
||||||
|
+ '</td>'
|
||||||
|
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + g.index + ', this)">Reset GPU</button></td>'
|
||||||
|
+ '</tr>';
|
||||||
|
}).join('');
|
||||||
|
table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
|
||||||
|
}).catch(e => {
|
||||||
|
status.textContent = 'Error loading NVIDIA GPU status: ' + e;
|
||||||
|
status.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
table.innerHTML = '';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
loadNvidiaSelfHeal();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTools returns the HTML (markup plus inline JS) for the Tools page:
// a System Install card (install-to-RAM status polled once from
// /api/system/ram-status, plus the shared install-to-disk UI), a Support
// Bundle card with the USB black-box export, a Tool Check table fed by
// /api/tools/check, and inline NVIDIA self-heal, Network and Services
// sections spliced in from sibling render*Inline helpers.
func renderTools() string {
	return `<div class="card" style="margin-bottom:16px">
<div class="card-head">System Install</div>
<div class="card-body">
<div style="margin-bottom:20px">
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
</div>
<div style="border-top:1px solid var(--line);padding-top:20px">
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
		// Shared disk-install fragment (also used by the /install page).
		renderInstallInline() + `
</div>
</div>
</div>
<script>
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
const boot = document.getElementById('boot-source-text');
const txt = document.getElementById('ram-status-text');
const btn = document.getElementById('ram-install-btn');
let source = d.device || d.source || 'unknown source';
let kind = d.kind || 'unknown';
let label = source;
if (kind === 'ram') label = 'RAM';
else if (kind === 'usb') label = 'USB (' + source + ')';
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
else if (kind === 'disk') label = 'disk (' + source + ')';
else label = source;
boot.textContent = 'Current boot source: ' + label + '.';
txt.textContent = d.message || 'Checking...';
if (d.status === 'ok' || d.in_ram) {
txt.style.color = 'var(--ok, green)';
} else if (d.status === 'failed') {
txt.style.color = 'var(--err, #b91c1c)';
} else {
txt.style.color = 'var(--muted)';
}
if (d.can_start_task) {
btn.style.display = '';
btn.disabled = false;
} else {
btn.style.display = 'none';
}
});
function installToRAM() {
document.getElementById('ram-install-btn').disabled = true;
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
window.location.href = '/tasks#' + d.task_id;
});
}
</script>

<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
` + renderSupportBundleInline() + `
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
` + renderUSBExportInline() + `
</div>
</div></div>

<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>

<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
		// Self-heal fragment defined alongside the NVIDIA status page.
		renderNvidiaSelfHealInline() + `</div></div>

<div class="card"><div class="card-head">Network</div><div class="card-body">` +
		renderNetworkInline() + `</div></div>

<div class="card"><div class="card-head">Services</div><div class="card-body">` +
		renderServicesInline() + `</div></div>


<script>
function checkTools() {
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
const rows = tools.map(t =>
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
).join('');
document.getElementById('tools-table').innerHTML =
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
});
}
checkTools();
</script>`
}
|
||||||
|
|
||||||
|
func renderExportIndex(exportDir string) (string, error) {
|
||||||
|
entries, err := listExportFiles(exportDir)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||||
|
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||||
|
for _, entry := range entries {
|
||||||
|
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
body.WriteString(`<li>No export files found.</li>`)
|
||||||
|
}
|
||||||
|
body.WriteString(`</ul></body></html>`)
|
||||||
|
return body.String(), nil
|
||||||
|
}
|
||||||
314
audit/internal/webui/page_install_tasks.go
Normal file
314
audit/internal/webui/page_install_tasks.go
Normal file
@@ -0,0 +1,314 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
// renderInstallInline returns the install-to-disk UI fragment without an
// outer card, for embedding in both the Tools page and the /install page.
// It renders a destructive-erase warning, a selectable disk table fed by
// /api/install/disks, a type-the-device-name confirmation gate, and an
// installation progress terminal streamed over SSE from
// /api/tasks/{id}/stream. Disk warnings are shown as badges whose full text
// lives in a title attribute; double quotes in that text are escaped as
// &quot; so the attribute cannot be broken out of (the previous code
// replaced a quote with itself, a no-op).
func renderInstallInline() string {
	return `
<div class="alert alert-warn" style="margin-bottom:16px">
<strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.
</div>
<div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
<div id="install-disk-section" style="display:none">
<div class="card" style="margin-bottom:0">
<table id="install-disk-table">
<thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
<tbody id="install-disk-tbody"></tbody>
</table>
</div>
<div style="margin-top:12px">
<button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
</div>
</div>
<div id="install-confirm-section" style="display:none;margin-top:20px">
<div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
<div class="form-row" style="max-width:360px">
<label>Type the device name to confirm (e.g. /dev/sda)</label>
<input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
</div>
<button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
<button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
</div>
<div id="install-progress-section" style="display:none;margin-top:20px">
<div class="card-head" style="margin-bottom:8px">Installation Progress</div>
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
</div>

<style>
#install-disk-tbody tr{cursor:pointer}
#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
</style>

<script>
var _installSelected = null;

function installRefreshDisks() {
document.getElementById('install-loading').style.display = '';
document.getElementById('install-disk-section').style.display = 'none';
document.getElementById('install-confirm-section').style.display = 'none';
_installSelected = null;
fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
document.getElementById('install-loading').style.display = 'none';
var tbody = document.getElementById('install-disk-tbody');
tbody.innerHTML = '';
if (!disks || disks.length === 0) {
tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
} else {
disks.forEach(function(d) {
var warnings = (d.warnings || []);
var statusHtml;
if (warnings.length === 0) {
statusHtml = '<span class="badge badge-ok">OK</span>';
} else {
var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
statusHtml = warnings.map(function(w){
var cls = hasSmall ? 'badge-err' : 'badge-warn';
return '<span class="badge ' + cls + '" title="' + w.replace(/"/g,'&quot;') + '">' +
(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
}).join(' ');
}
var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
var tr = document.createElement('tr');
tr.dataset.device = d.device;
tr.dataset.model = d.model || 'Unknown';
tr.dataset.size = d.size;
tr.dataset.warnings = JSON.stringify(warnings);
tr.innerHTML =
'<td><input type="radio" name="install-disk" value="' + d.device + '"></td>' +
'<td><code>' + d.device + '</code>' + mountedNote + '</td>' +
'<td>' + (d.model || '—') + '</td>' +
'<td>' + d.size + '</td>' +
'<td>' + statusHtml + '</td>';
tr.addEventListener('click', function(){ installSelectDisk(this); });
tbody.appendChild(tr);
});
}
document.getElementById('install-disk-section').style.display = '';
}).catch(function(e){
document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
});
}

function installSelectDisk(tr) {
document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
tr.classList.add('selected');
var radio = tr.querySelector('input[type=radio]');
if (radio) radio.checked = true;
_installSelected = {
device: tr.dataset.device,
model: tr.dataset.model,
size: tr.dataset.size,
warnings: JSON.parse(tr.dataset.warnings || '[]')
};
var warnBox = document.getElementById('install-confirm-warn');
var warnLines = '<strong>⚠ DANGER:</strong> ' + _installSelected.device +
' (' + _installSelected.model + ', ' + _installSelected.size + ')' +
' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
if (_installSelected.warnings.length > 0) {
warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + w; }).join('<br>');
}
warnBox.innerHTML = warnLines;
document.getElementById('install-confirm-input').value = '';
document.getElementById('install-start-btn').disabled = true;
document.getElementById('install-confirm-section').style.display = '';
document.getElementById('install-progress-section').style.display = 'none';
}

function installDeselect() {
_installSelected = null;
document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
document.getElementById('install-confirm-section').style.display = 'none';
}

function installCheckConfirm() {
var val = document.getElementById('install-confirm-input').value.trim();
var ok = _installSelected && val === _installSelected.device;
document.getElementById('install-start-btn').disabled = !ok;
}

function installStart() {
if (!_installSelected) return;
document.getElementById('install-confirm-section').style.display = 'none';
document.getElementById('install-disk-section').style.display = 'none';
document.getElementById('install-loading').style.display = 'none';
var prog = document.getElementById('install-progress-section');
var term = document.getElementById('install-terminal');
var status = document.getElementById('install-status');
prog.style.display = '';
term.textContent = '';
status.textContent = 'Starting installation…';
status.style.color = 'var(--muted)';

fetch('/api/install/run', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({device: _installSelected.device})
}).then(function(r){
return r.json().then(function(j){
if (!r.ok) throw new Error(j.error || r.statusText);
return j;
});
}).then(function(j){
if (!j.task_id) throw new Error('missing task id');
installStreamLog(j.task_id);
}).catch(function(e){
status.textContent = 'Error: ' + e;
status.style.color = 'var(--crit-fg)';
});
}

function installStreamLog(taskId) {
var term = document.getElementById('install-terminal');
var status = document.getElementById('install-status');
var es = new EventSource('/api/tasks/' + taskId + '/stream');
es.onmessage = function(e) {
term.textContent += e.data + '\n';
term.scrollTop = term.scrollHeight;
};
es.addEventListener('done', function(e) {
es.close();
if (!e.data) {
status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
var rebootBtn = document.createElement('button');
rebootBtn.className = 'btn btn-primary btn-sm';
rebootBtn.style.marginLeft = '12px';
rebootBtn.textContent = 'Reboot now';
rebootBtn.onclick = function(){
fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
body: JSON.stringify({name:'', action:'reboot'})});
};
status.appendChild(rebootBtn);
} else {
status.textContent = '✗ Installation failed: ' + e.data;
status.style.color = 'var(--crit-fg)';
}
});
es.onerror = function() {
es.close();
status.textContent = '✗ Stream disconnected.';
status.style.color = 'var(--crit-fg)';
};
}

installRefreshDisks();
</script>
`
}
|
||||||
|
|
||||||
|
func renderInstall() string {
|
||||||
|
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||||
|
renderInstallInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTasks returns the HTML (markup plus inline JS) for the task list
// page: a table refreshed every 2 seconds from /api/tasks, paginated
// client-side at 50 rows per page, with per-task cancel and priority
// controls, a cancel-all button, and an "abort + kill orphans" action
// posting to /api/tasks/kill-workers. Task names are passed through escHtml
// before being spliced into row markup; escHtml now emits real HTML
// entities (&amp;, &lt;, &gt;, &quot;) — previously each character was
// replaced with itself, leaving names completely unescaped.
func renderTasks() string {
	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
</div>
<div class="card">
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
</div>
<script>
var _taskRefreshTimer = null;
var _tasksAll = [];
var _taskPage = 1;
var _taskPageSize = 50;

function loadTasks() {
fetch('/api/tasks').then(r=>r.json()).then(tasks => {
_tasksAll = Array.isArray(tasks) ? tasks : [];
if (_tasksAll.length === 0) {
_taskPage = 1;
document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
return;
}
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
if (_taskPage > totalPages) _taskPage = totalPages;
if (_taskPage < 1) _taskPage = 1;
const start = (_taskPage - 1) * _taskPageSize;
const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
const rows = pageTasks.map(t => {
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
if (t.status === 'running' || t.status === 'pending') {
actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask(\''+t.id+'\')">Cancel</button>';
}
if (t.status === 'pending') {
actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',1)" title="Increase priority">⇧</button>';
actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',-1)" title="Decrease priority">⇩</button>';
}
return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
'<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
'<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
'<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
'<td>'+t.priority+'</td>' +
'<td>'+actions+'</td></tr>';
}).join('');
const showingFrom = start + 1;
const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
const pager =
'<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
'<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
'<div style="display:flex;align-items:center;gap:8px">' +
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
'<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
'</div>' +
'</div>';
document.getElementById('tasks-table').innerHTML =
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
});
}

function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
function formatDurSec(sec) {
sec = Math.max(0, Math.round(sec||0));
if (sec < 60) return sec+'s';
const m = Math.floor(sec/60), ss = sec%60;
return m+'m '+ss+'s';
}
function setTaskPage(page) {
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
_taskPage = Math.min(totalPages, Math.max(1, page));
loadTasks();
}

function cancelTask(id) {
fetch('/api/tasks/'+id+'/cancel',{method:'POST'}).then(()=>loadTasks());
}
function cancelAll() {
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
}
function killWorkers() {
if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
fetch('/api/tasks/kill-workers',{method:'POST'})
.then(r=>r.json())
.then(d=>{
loadTasks();
var toast = document.getElementById('kill-toast');
var parts = [];
if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
toast.style.display = '';
setTimeout(()=>{ toast.style.display='none'; }, 5000);
});
}
function setPriority(id, delta) {
fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
.then(()=>loadTasks());
}

loadTasks();
_taskRefreshTimer = setInterval(loadTasks, 2000);
</script>`
}
|
||||||
238
audit/internal/webui/page_metrics.go
Normal file
238
audit/internal/webui/page_metrics.go
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
func renderMetrics() string {
|
||||||
|
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Server — Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Temperature — CPU</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Temperature — Ambient Sensors</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Server — Power</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
|
||||||
|
<div class="card-head">Server — Fan RPM</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
|
||||||
|
<div>
|
||||||
|
<div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
|
||||||
|
<div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
|
||||||
|
</div>
|
||||||
|
<label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
|
||||||
|
<input id="gpu-chart-toggle" type="checkbox">
|
||||||
|
<span>One chart per GPU</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="gpu-metrics-by-metric">
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Compute Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Memory Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Core Clock</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Power</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Temperature</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="gpu-metrics-by-gpu" style="display:none"></div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let gpuChartKey = '';
|
||||||
|
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
|
||||||
|
let metricsNvidiaGPUsPromise = null;
|
||||||
|
|
||||||
|
function loadMetricsNvidiaGPUs() {
|
||||||
|
if (!metricsNvidiaGPUsPromise) {
|
||||||
|
metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||||
|
.then(function(r) {
|
||||||
|
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||||
|
return r.json();
|
||||||
|
})
|
||||||
|
.then(function(list) { return Array.isArray(list) ? list : []; })
|
||||||
|
.catch(function() { return []; });
|
||||||
|
}
|
||||||
|
return metricsNvidiaGPUsPromise;
|
||||||
|
}
|
||||||
|
|
||||||
|
function metricsGPUNameMap(list) {
|
||||||
|
const out = {};
|
||||||
|
(list || []).forEach(function(gpu) {
|
||||||
|
const idx = Number(gpu.index);
|
||||||
|
if (!Number.isFinite(idx) || !gpu.name) return;
|
||||||
|
out[idx] = gpu.name;
|
||||||
|
});
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
function metricsGPUDisplayLabel(idx, names) {
|
||||||
|
const name = names && names[idx];
|
||||||
|
return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
function loadGPUChartModePreference() {
|
||||||
|
try {
|
||||||
|
return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
|
||||||
|
} catch (_) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function saveGPUChartModePreference(perGPU) {
|
||||||
|
try {
|
||||||
|
sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
|
||||||
|
} catch (_) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
function refreshChartImage(el) {
|
||||||
|
if (!el || el.dataset.loading === '1') return;
|
||||||
|
if (el.offsetParent === null) return;
|
||||||
|
const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
|
||||||
|
const nextSrc = baseSrc + '?t=' + Date.now();
|
||||||
|
const probe = new Image();
|
||||||
|
el.dataset.baseSrc = baseSrc;
|
||||||
|
el.dataset.loading = '1';
|
||||||
|
probe.onload = function() {
|
||||||
|
el.src = nextSrc;
|
||||||
|
el.dataset.loading = '0';
|
||||||
|
};
|
||||||
|
probe.onerror = function() {
|
||||||
|
el.dataset.loading = '0';
|
||||||
|
};
|
||||||
|
probe.src = nextSrc;
|
||||||
|
}
|
||||||
|
|
||||||
|
function refreshCharts() {
|
||||||
|
document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
|
||||||
|
}
|
||||||
|
|
||||||
|
function gpuIndices(rows) {
|
||||||
|
const seen = {};
|
||||||
|
const out = [];
|
||||||
|
(rows || []).forEach(function(row) {
|
||||||
|
const idx = Number(row.index);
|
||||||
|
if (!Number.isFinite(idx) || seen[idx]) return;
|
||||||
|
seen[idx] = true;
|
||||||
|
out.push(idx);
|
||||||
|
});
|
||||||
|
return out.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
|
||||||
|
function renderGPUOverviewCards(indices, names) {
|
||||||
|
const host = document.getElementById('gpu-metrics-by-gpu');
|
||||||
|
if (!host) return;
|
||||||
|
host.innerHTML = indices.map(function(idx) {
|
||||||
|
const label = metricsGPUDisplayLabel(idx, names);
|
||||||
|
return '<div class="card" style="margin-bottom:16px">' +
|
||||||
|
'<div class="card-head">' + label + ' — Overview</div>' +
|
||||||
|
'<div class="card-body" style="padding:8px">' +
|
||||||
|
'<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
|
||||||
|
'</div></div>';
|
||||||
|
}).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
function applyGPUChartMode() {
|
||||||
|
const perMetric = document.getElementById('gpu-metrics-by-metric');
|
||||||
|
const perGPU = document.getElementById('gpu-metrics-by-gpu');
|
||||||
|
const toggle = document.getElementById('gpu-chart-toggle');
|
||||||
|
const gpuModePerGPU = !!(toggle && toggle.checked);
|
||||||
|
if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
|
||||||
|
if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reconcile the metrics-page layout with the latest sample `d`:
// hide the fan card when no fans are reported, toggle the GPU section,
// refresh the GPU summary line, and rebuild the per-GPU overview cards
// only when the set of GPU indices or their names actually changed.
function syncMetricsLayout(d) {
  const fansCard = document.getElementById('card-server-fans');
  if (fansCard) {
    fansCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
  }
  const gpuSection = document.getElementById('gpu-metrics-section');
  const gpuSummary = document.getElementById('gpu-metrics-summary');
  const idxList = gpuIndices(d.gpus);
  loadMetricsNvidiaGPUs().then(function(gpus) {
    const nameMap = metricsGPUNameMap(gpus);
    if (gpuSection) gpuSection.style.display = idxList.length > 0 ? '' : 'none';
    if (gpuSummary) {
      if (idxList.length > 0) {
        const labels = idxList.map(function(i) { return metricsGPUDisplayLabel(i, nameMap); });
        gpuSummary.textContent = 'Detected GPUs: ' + labels.join(', ');
      } else {
        gpuSummary.textContent = 'No GPUs detected in live metrics.';
      }
    }
    // Cheap change-detection key: indices plus their reported names.
    const key = idxList.join(',') + '|' +
      idxList.map(function(i) { return nameMap[i] || ''; }).join(',');
    if (key !== gpuChartKey) {
      renderGPUOverviewCards(idxList, nameMap);
      gpuChartKey = key;
    }
    applyGPUChartMode();
  });
}
|
||||||
|
|
||||||
|
// Fetch the latest metrics sample and re-sync the page layout.
// Errors are swallowed; the next poll cycle retries.
function loadMetricsLayout() {
  fetch('/api/metrics/latest')
    .then(function(r) { return r.json(); })
    .then(syncMetricsLayout)
    .catch(function() {});
}
|
||||||
|
|
||||||
|
// Wire up the per-GPU/per-metric chart toggle: restore the persisted
// preference, apply it once, then persist and re-apply on each change.
const gpuChartToggle = document.getElementById('gpu-chart-toggle');
if (gpuChartToggle) {
  gpuChartToggle.checked = loadGPUChartModePreference();
}
applyGPUChartMode();

if (gpuChartToggle) {
  gpuChartToggle.addEventListener('change', function() {
    saveGPUChartModePreference(!!gpuChartToggle.checked);
    applyGPUChartMode();
    refreshCharts();
  });
}

// Initial load plus periodic refresh of the charts and the layout.
loadMetricsLayout();
setInterval(refreshCharts, 3000);
setInterval(loadMetricsLayout, 5000);
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
213
audit/internal/webui/page_network_services.go
Normal file
213
audit/internal/webui/page_network_services.go
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import "html"
|
||||||
|
|
||||||
|
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
|
||||||
|
func renderNetworkInline() string {
|
||||||
|
return `<div id="net-pending" style="display:none" class="alert alert-warn">
|
||||||
|
<strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
|
||||||
|
<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
|
||||||
|
<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
|
||||||
|
</div>
|
||||||
|
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div class="grid2" style="margin-top:16px">
|
||||||
|
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
|
||||||
|
<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
|
||||||
|
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
|
||||||
|
<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||||
|
</div>
|
||||||
|
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
|
||||||
|
<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
|
||||||
|
<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
|
||||||
|
<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
|
||||||
|
<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
|
||||||
|
<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
|
||||||
|
<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
|
||||||
|
<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
var _netCountdownTimer = null;
|
||||||
|
var _netRefreshTimer = null;
|
||||||
|
const NET_ROLLBACK_SECS = 60;
|
||||||
|
// Fetch /api/network and render the interface table; also shows or
// hides the pending-change banner depending on the server-side state.
function loadNetwork() {
  fetch('/api/network').then(function(r) { return r.json(); }).then(function(d) {
    let rows = '';
    for (const i of (d.interfaces || [])) {
      rows += '<tr><td style="cursor:pointer" onclick="selectIface(\''+i.Name+'\')" title="Use this interface in the forms below"><span style="text-decoration:underline">'+i.Name+'</span></td>' +
        '<td style="cursor:pointer" onclick="toggleIface(\''+i.Name+'\',\''+i.State+'\')" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td>' +
        '<td>'+(i.IPv4||[]).join(', ')+'</td></tr>';
    }
    const routeNote = d.default_route
      ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>'
      : '';
    document.getElementById('iface-table').innerHTML =
      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' + routeNote;
    if (d.pending_change) {
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    } else {
      hideNetPending();
    }
  }).catch(function() {});
}
|
||||||
|
// Copy the clicked interface name into both the DHCP and static forms.
function selectIface(iface) {
  ['dhcp-iface', 'st-iface'].forEach(function(id) {
    document.getElementById(id).value = iface;
  });
}
|
||||||
|
// Toggle the given interface up/down via the API. `currentState` is
// part of the onclick signature but the server decides the transition.
function toggleIface(iface, currentState) {
  showNetPending(NET_ROLLBACK_SECS);
  const opts = {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({iface: iface})
  };
  fetch('/api/network/toggle', opts)
    .then(function(r) { return r.json(); })
    .then(function(d) {
      if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
      loadNetwork();
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    })
    .catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
|
||||||
|
// Hide the pending-change banner and stop its countdown timer.
// Fix: guard the element lookup — the original dereferenced
// el.style unconditionally and would throw if #net-pending were
// ever absent, whereas sibling helpers in this file null-check
// their DOM lookups.
function hideNetPending() {
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  _netCountdownTimer = null;
  const el = document.getElementById('net-pending');
  if (el) el.style.display = 'none';
}
|
||||||
|
// Show the pending-change banner with a live countdown of `secs`
// seconds. When the countdown hits zero the banner is hidden and the
// interface table reloaded (the server auto-rolls-back by then).
function showNetPending(secs) {
  if (!secs || secs < 1) { hideNetPending(); return; }
  const banner = document.getElementById('net-pending');
  banner.style.display = 'block';
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  let left = secs;
  document.getElementById('net-countdown').textContent = left;
  _netCountdownTimer = setInterval(function() {
    left -= 1;
    document.getElementById('net-countdown').textContent = left;
    if (left <= 0) {
      hideNetPending();
      loadNetwork();
    }
  }, 1000);
}
|
||||||
|
// Confirm the pending network change so the server cancels rollback.
function confirmNetChange() {
  hideNetPending();
  fetch('/api/network/confirm', {method: 'POST'})
    .then(function() { loadNetwork(); })
    .catch(function() {});
}
|
||||||
|
// Ask the server to roll back the pending network change immediately.
function rollbackNetChange() {
  hideNetPending();
  fetch('/api/network/rollback', {method: 'POST'})
    .then(function() { loadNetwork(); })
    .catch(function() {});
}
|
||||||
|
// Run DHCP on the named interface (or every interface when the field
// is empty) and display the command output under the form.
function runDHCP() {
  const iface = document.getElementById('dhcp-iface').value.trim();
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/dhcp', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({interface: iface || 'all'})
  }).then(function(r) { return r.json(); }).then(function(d) {
    document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
    if (d.error) { hideNetPending(); return; }
    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    loadNetwork();
  }).catch(function() {
    setTimeout(loadNetwork, 1500);
  });
}
|
||||||
|
// Apply a static IPv4 configuration from the form fields. DNS servers
// come from a comma-separated list with blank entries dropped.
function setStatic() {
  const dnsList = document.getElementById('st-dns').value
    .split(',')
    .map(function(s) { return s.trim(); })
    .filter(Boolean);
  showNetPending(NET_ROLLBACK_SECS);
  const payload = {
    interface: document.getElementById('st-iface').value,
    address: document.getElementById('st-addr').value,
    prefix: document.getElementById('st-prefix').value,
    gateway: document.getElementById('st-gw').value,
    dns: dnsList,
  };
  fetch('/api/network/static', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(payload)
  }).then(function(r) { return r.json(); }).then(function(d) {
    document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
    if (d.error) { hideNetPending(); return; }
    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    loadNetwork();
  }).catch(function() {
    setTimeout(loadNetwork, 1500);
  });
}
|
||||||
|
// Initial render plus a 5 s auto-refresh; clear any previous timer so
// re-injecting this script cannot stack intervals.
loadNetwork();
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderNetwork() string {
|
||||||
|
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||||
|
renderNetworkInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderServicesInline() string {
|
||||||
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||||
|
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||||
|
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div id="svc-out" style="display:none;margin-top:12px">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
|
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||||
|
<span id="svc-out-status" style="font-size:12px"></span>
|
||||||
|
</div>
|
||||||
|
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
// Fetch the service list and render the unit table with per-unit
// start/stop/restart buttons and a collapsible unit-file body.
function loadServices() {
  fetch('/api/services').then(r=>r.json()).then(svcs => {
    const rows = svcs.map(s => {
      const st = s.state||'unknown';
      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
      // NOTE(review): s.name is interpolated into ids and onclick handlers
      // without escaping — confirm unit names stay within [a-z0-9.@-].
      const id = 'svc-body-'+s.name.replace(/[^a-z0-9]/g,'-');
      // BUG FIX: the previous code replaced '<' with '<' and '>' with '>'
      // (no-ops), so unit-file content was injected as raw HTML into the
      // <pre> below. Escape '&' first, then the angle brackets.
      const body = (s.body||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');
      return '<tr>' +
        '<td style="white-space:nowrap">'+s.name+'</td>' +
        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+st+' ▾</span>' +
        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
        '</td>' +
        '<td style="white-space:nowrap">' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start" onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop" onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
        '</td></tr>';
    }).join('');
    document.getElementById('svc-table').innerHTML =
      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
  });
}
|
||||||
|
// Toggle visibility of the unit-file body element identified by `id`.
function toggleBody(id) {
  const el = document.getElementById(id);
  if (!el) return;
  el.style.display = (el.style.display === 'none') ? 'block' : 'none';
}
|
||||||
|
// Run a systemd-style action ('start'/'stop'/'restart') against unit
// `name`, streaming the result into the shared output terminal.
// `btn` is the clicked button; it is disabled for the duration and its
// label restored afterwards.
function svcAction(btn, name, action) {
// Remember the label so it can be restored after '...'.
var label = btn.textContent;
btn.disabled = true;
btn.textContent = '...';
var out = document.getElementById('svc-out');
var term = document.getElementById('svc-terminal');
var statusEl = document.getElementById('svc-out-status');
var labelEl = document.getElementById('svc-out-label');
// Reveal the output panel and reset it for this invocation.
out.style.display = 'block';
labelEl.textContent = action + ' ' + name;
term.textContent = 'Running...';
statusEl.textContent = '';
statusEl.style.color = '';
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
.then(r=>r.json()).then(d => {
term.textContent = d.output || d.error || '(no output)';
// Keep the newest output visible.
term.scrollTop = term.scrollHeight;
if (d.status === 'ok') {
statusEl.textContent = '✓ done';
statusEl.style.color = 'var(--ok-fg, #2c662d)';
} else {
statusEl.textContent = '✗ failed';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
}
btn.textContent = label;
btn.disabled = false;
// Short delay lets the unit state settle before re-querying.
setTimeout(loadServices, 800);
}).catch(e => {
// Network-level failure: show the error and re-enable the button.
term.textContent = 'Request failed: ' + e;
statusEl.textContent = '✗ error';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
btn.textContent = label;
btn.disabled = false;
});
}
|
||||||
|
// Populate the service table on first render.
loadServices();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderServices() string {
|
||||||
|
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||||
|
renderServicesInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
663
audit/internal/webui/page_validate.go
Normal file
663
audit/internal/webui/page_validate.go
Normal file
@@ -0,0 +1,663 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// validateInventory carries the pre-rendered hardware summary strings
// shown on the Validate page, plus GPU counts used to scale the
// duration estimates. Populated by loadValidateInventory (defined
// elsewhere in this package).
type validateInventory struct {
	CPU            string // CPU summary line
	Memory         string // memory summary line
	Storage        string // storage summary line
	NVIDIA         string // NVIDIA GPU summary line
	AMD            string // AMD GPU summary line
	NvidiaGPUCount int    // number of NVIDIA GPUs detected
	AMDGPUCount    int    // number of AMD GPUs detected
}
|
||||||
|
|
||||||
|
// validateFmtDur formats an estimated duration as a short human string:
// "~N s" for anything under two minutes, otherwise "~N min" with the
// minute count computed as (secs+29)/60 (i.e. remainders of 31 s or
// more round up).
func validateFmtDur(secs int) string {
	if secs >= 120 {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||||
|
|
||||||
|
func validateTotalValidateSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateTotalStressSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
|
platform.SATEstimatedMemoryStressSec +
|
||||||
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValidate(opts HandlerOptions) string {
|
||||||
|
inv := loadValidateInventory(opts)
|
||||||
|
n := inv.NvidiaGPUCount
|
||||||
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
|
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||||
|
gpuNote := ""
|
||||||
|
if n > 0 {
|
||||||
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
|
}
|
||||||
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||||
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Validate Profile</div>
|
||||||
|
<div class="card-body validate-profile-body">
|
||||||
|
<div class="validate-profile-col">
|
||||||
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||||
|
</div>
|
||||||
|
<div class="validate-profile-col validate-profile-action">
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
|
<div style="margin-top:12px">
|
||||||
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||||
|
inv.CPU,
|
||||||
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
|
inv.Memory,
|
||||||
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
|
inv.Storage,
|
||||||
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
|
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
|
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||||
|
)) +
|
||||||
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
|
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
|
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
|
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-interconnect">` +
|
||||||
|
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-bandwidth">` +
|
||||||
|
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
|
`<code>nvbandwidth</code>`,
|
||||||
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`</div>
|
||||||
|
<div class="grid3" style="margin-top:16px">
|
||||||
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
|
inv.AMD,
|
||||||
|
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||||
|
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||||
|
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||||
|
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
<style>
|
||||||
|
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||||
|
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||||
|
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||||
|
.validate-card-body { padding:0; }
|
||||||
|
.validate-card-section { padding:12px 16px 0; }
|
||||||
|
.validate-card-section:last-child { padding-bottom:16px; }
|
||||||
|
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||||
|
</style>
|
||||||
|
<script>
|
||||||
|
let satES = null;
|
||||||
|
// True when the "Stress" mode radio button is currently selected.
function satStressMode() {
  const checked = document.querySelector('input[name="sat-mode"]:checked');
  return !!checked && checked.value === 'stress';
}
|
||||||
|
// Dim the stress-only NVIDIA cards and show their "stress only" hints
// while Validate mode is selected; restore them in Stress mode.
function satModeChanged() {
  const stress = satStressMode();
  const stressOnly = [
    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
    {card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
    {card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
  ];
  for (const item of stressOnly) {
    const card = document.getElementById(item.card);
    if (!card) continue;
    card.style.opacity = stress ? '1' : '0.5';
    const hint = document.getElementById(item.hint);
    if (hint) hint.style.display = stress ? 'none' : '';
  }
}
|
||||||
|
// Maps SAT target ids to the human-readable task titles used when
// enqueuing runs and labeling terminal output.
function satLabels() {
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
}
|
||||||
|
// Memoized promise for the NVIDIA GPU inventory (fetched at most once).
let satNvidiaGPUsPromise = null;
// Fetch /api/gpu/nvidia once and cache the resulting array. A non-OK
// response rejects; a non-array payload normalizes to []. Note: a
// rejected promise stays cached, so a failed fetch is not retried.
function loadSatNvidiaGPUs() {
  if (satNvidiaGPUsPromise) return satNvidiaGPUsPromise;
  satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
    .then(function(r) {
      if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
      return r.json();
    })
    .then(function(list) { return Array.isArray(list) ? list : []; });
  return satNvidiaGPUsPromise;
}
|
||||||
|
// Indices of the checked, enabled GPU checkboxes, sorted ascending.
function satSelectedGPUIndices() {
  const picked = [];
  for (const el of document.querySelectorAll('.sat-nvidia-checkbox')) {
    if (!el.checked || el.disabled) continue;
    const v = parseInt(el.value, 10);
    if (!Number.isNaN(v)) picked.push(v);
  }
  return picked.sort(function(a, b) { return a - b; });
}
|
||||||
|
// Refresh the helper text under the GPU list to reflect the current
// checkbox selection.
function satUpdateGPUSelectionNote() {
  const note = document.getElementById('sat-gpu-selection-note');
  if (!note) return;
  const selected = satSelectedGPUIndices();
  note.textContent = selected.length
    ? 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
    : 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
}
|
||||||
|
// Render one checkbox row per detected NVIDIA GPU (all checked by
// default), or a placeholder message when none are present.
function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
  if (!root) return;
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    satUpdateGPUSelectionNote();
    return;
  }
  const rows = [];
  for (const gpu of gpus) {
    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
    // NOTE(review): gpu.name is injected without HTML escaping — confirm
    // the API only returns trusted device names.
    rows.push('<label class="sat-gpu-row">'
      + '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
      + '</label>');
  }
  root.innerHTML = rows.join('');
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Check every GPU checkbox, then refresh the selection note.
function satSelectAllGPUs() {
  for (const el of document.querySelectorAll('.sat-nvidia-checkbox')) {
    el.checked = true;
  }
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Uncheck every GPU checkbox, then refresh the selection note.
function satSelectNoGPUs() {
  for (const el of document.querySelectorAll('.sat-nvidia-checkbox')) {
    el.checked = false;
  }
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Populate the GPU list card from the cached inventory, rendering an
// inline error message if the fetch failed.
function satLoadGPUs() {
  loadSatNvidiaGPUs()
    .then(satRenderGPUList)
    .catch(function(err) {
      const root = document.getElementById('sat-gpu-list');
      if (root) {
        root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
      }
      satUpdateGPUSelectionNote();
    });
}
|
||||||
|
// Build a "GPU <index> — <name>" label. The index falls back to 0 when
// missing or non-numeric; the name falls back to "GPU <index>".
function satGPUDisplayName(gpu) {
  let idx = 0;
  if (gpu && Number.isFinite(Number(gpu.index))) {
    idx = Number(gpu.index);
  }
  const label = (gpu && gpu.name) ? gpu.name : ('GPU ' + idx);
  return 'GPU ' + idx + ' — ' + label;
}
|
||||||
|
// Build the JSON body for a /api/sat/<target>/run request: display
// name, stress flag, target-specific defaults (CPU duration), then any
// caller-supplied overrides applied last.
function satRequestBody(target, overrides) {
  const labels = satLabels();
  const body = {
    display_name: labels[target] || ('Validate ' + target),
    stress_mode: satStressMode(),
  };
  if (target === 'cpu') {
    body.duration = satStressMode() ? 1800 : 60;
  }
  if (overrides) {
    for (const key of Object.keys(overrides)) {
      body[key] = overrides[key];
    }
  }
  return body;
}
|
||||||
|
function enqueueSATTarget(target, overrides) {
|
||||||
|
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||||
|
.then(r => r.json());
|
||||||
|
}
|
||||||
|
function streamSATTask(taskId, title, resetTerminal) {
|
||||||
|
if (satES) { satES.close(); satES = null; }
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
if (resetTerminal) {
|
||||||
|
term.textContent = '';
|
||||||
|
}
|
||||||
|
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||||
|
return new Promise(function(resolve) {
|
||||||
|
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
satES.addEventListener('done', function(e) {
|
||||||
|
satES.close();
|
||||||
|
satES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: !e.data, error: e.data || ''});
|
||||||
|
});
|
||||||
|
satES.onerror = function() {
|
||||||
|
if (satES) {
|
||||||
|
satES.close();
|
||||||
|
satES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: false, error: 'stream disconnected'});
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function selectedAMDValidateTargets() {
|
||||||
|
const targets = [];
|
||||||
|
const gpu = document.getElementById('sat-amd-target');
|
||||||
|
const mem = document.getElementById('sat-amd-mem-target');
|
||||||
|
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||||
|
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||||
|
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||||
|
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||||
|
return targets;
|
||||||
|
}
|
||||||
|
function runSAT(target) {
|
||||||
|
return runSATWithOverrides(target, null);
|
||||||
|
}
|
||||||
|
function runSATWithOverrides(target, overrides) {
|
||||||
|
const title = (overrides && overrides.display_name) || target;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||||
|
return enqueueSATTarget(target, overrides)
|
||||||
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
|
}
|
||||||
|
const nvidiaPerGPUTargets = [];
|
||||||
|
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
|
function satAllGPUIndicesForMulti() {
|
||||||
|
return Promise.resolve(satSelectedGPUIndices());
|
||||||
|
}
|
||||||
|
function expandSATTarget(target) {
|
||||||
|
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||||
|
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||||
|
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||||
|
return Promise.resolve([{target: target}]);
|
||||||
|
}
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
|
}
|
||||||
|
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||||
|
target: target,
|
||||||
|
overrides: {
|
||||||
|
gpu_indices: [Number(gpu.index)],
|
||||||
|
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||||
|
},
|
||||||
|
label: satGPUDisplayName(gpu),
|
||||||
|
})));
|
||||||
|
}
|
||||||
|
function runNvidiaFabricValidate(target) {
|
||||||
|
satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||||
|
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runNvidiaValidateSet(target) {
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||||
|
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||||
|
}
|
||||||
|
function runAMDValidateSet() {
|
||||||
|
const targets = selectedAMDValidateTargets();
|
||||||
|
if (!targets.length) return;
|
||||||
|
if (targets.length === 1) return runSAT(targets[0]);
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— amd';
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
term.textContent = 'Running AMD validate set one by one...\n';
|
||||||
|
const labels = satLabels();
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= targets.length) return Promise.resolve();
|
||||||
|
const target = targets[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||||
|
return enqueueSATTarget(target)
|
||||||
|
.then(d => {
|
||||||
|
return streamSATTask(d.task_id, labels[target], false);
|
||||||
|
}).then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}
|
||||||
|
function runAllSAT() {
|
||||||
|
const cycles = 1;
|
||||||
|
const status = document.getElementById('sat-all-status');
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||||
|
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
|
const activeTargets = baseTargets.filter(target => {
|
||||||
|
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||||
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
|
return !(btn && btn.disabled);
|
||||||
|
});
|
||||||
|
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||||
|
const expanded = [];
|
||||||
|
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||||
|
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||||
|
}
|
||||||
|
const total = expanded.length;
|
||||||
|
let enqueued = 0;
|
||||||
|
if (!total) {
|
||||||
|
status.textContent = 'No tasks selected.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||||
|
const item = expanded[idx];
|
||||||
|
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||||
|
return enqueueSATTarget(item.target, item.overrides)
|
||||||
|
.then(() => {
|
||||||
|
enqueued++;
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}).catch(err => {
|
||||||
|
status.textContent = 'Error: ' + err.message;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
<script>
|
||||||
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
|
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||||
|
});
|
||||||
|
satLoadGPUs();
|
||||||
|
function disableSATAMDOptions(reason) {
|
||||||
|
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||||
|
const cb = document.getElementById(id);
|
||||||
|
if (!cb) return;
|
||||||
|
cb.disabled = true;
|
||||||
|
cb.checked = false;
|
||||||
|
cb.title = reason;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function disableSATCard(id, reason) {
|
||||||
|
const btn = document.getElementById('sat-btn-' + id);
|
||||||
|
if (!btn) return;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.title = reason;
|
||||||
|
btn.style.opacity = '0.4';
|
||||||
|
const card = btn.closest('.card');
|
||||||
|
if (card) {
|
||||||
|
let note = card.querySelector('.sat-unavail');
|
||||||
|
if (!note) {
|
||||||
|
note = document.createElement('p');
|
||||||
|
note.className = 'sat-unavail';
|
||||||
|
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||||
|
const body = card.querySelector('.card-body');
|
||||||
|
if (body) body.insertBefore(note, body.firstChild);
|
||||||
|
}
|
||||||
|
note.textContent = reason;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||||
|
unknown := "Audit snapshot not loaded."
|
||||||
|
out := validateInventory{
|
||||||
|
CPU: unknown,
|
||||||
|
Memory: unknown,
|
||||||
|
Storage: unknown,
|
||||||
|
NVIDIA: unknown,
|
||||||
|
AMD: unknown,
|
||||||
|
}
|
||||||
|
data, err := loadSnapshot(opts.AuditPath)
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
var snap schema.HardwareIngestRequest
|
||||||
|
if err := json.Unmarshal(data, &snap); err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
cpuCounts := map[string]int{}
|
||||||
|
cpuTotal := 0
|
||||||
|
for _, cpu := range snap.Hardware.CPUs {
|
||||||
|
if cpu.Present != nil && !*cpu.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cpuTotal++
|
||||||
|
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
memCounts := map[string]int{}
|
||||||
|
memTotal := 0
|
||||||
|
for _, dimm := range snap.Hardware.Memory {
|
||||||
|
if dimm.Present != nil && !*dimm.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
memTotal++
|
||||||
|
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
storageCounts := map[string]int{}
|
||||||
|
storageTotal := 0
|
||||||
|
for _, dev := range snap.Hardware.Storage {
|
||||||
|
if dev.Present != nil && !*dev.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
storageTotal++
|
||||||
|
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
nvidiaCounts := map[string]int{}
|
||||||
|
nvidiaTotal := 0
|
||||||
|
amdCounts := map[string]int{}
|
||||||
|
amdTotal := 0
|
||||||
|
for _, dev := range snap.Hardware.PCIeDevices {
|
||||||
|
if dev.Present != nil && !*dev.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if validateIsVendorGPU(dev, "nvidia") {
|
||||||
|
nvidiaTotal++
|
||||||
|
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
if validateIsVendorGPU(dev, "amd") {
|
||||||
|
amdTotal++
|
||||||
|
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||||||
|
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||||||
|
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||||
|
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||||
|
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||||
|
out.NvidiaGPUCount = nvidiaTotal
|
||||||
|
out.AMDGPUCount = amdTotal
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValidateCardBody(devices, description, commands, settings string) string {
|
||||||
|
return `<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">` + devices + `</div></div>` +
|
||||||
|
`<div class="validate-card-section"><div style="font-size:13px">` + description + `</div></div>` +
|
||||||
|
`<div class="validate-card-section"><div style="font-size:13px">` + commands + `</div></div>` +
|
||||||
|
`<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">` + settings + `</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
|
||||||
|
if total == 0 {
|
||||||
|
return "0 " + unit + "s detected."
|
||||||
|
}
|
||||||
|
keys := make([]string, 0, len(models))
|
||||||
|
for key := range models {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
parts := make([]string, 0, len(keys))
|
||||||
|
for _, key := range keys {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d x %s", models[key], html.EscapeString(key)))
|
||||||
|
}
|
||||||
|
label := unit
|
||||||
|
if total != 1 {
|
||||||
|
label += "s"
|
||||||
|
}
|
||||||
|
if len(parts) == 1 {
|
||||||
|
return parts[0] + " " + label
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
func addValidateModel(counts map[string]int, name string) {
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
if name == "" {
|
||||||
|
name = "unknown"
|
||||||
|
}
|
||||||
|
counts[name]++
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateTrimPtr(value *string) string {
|
||||||
|
if value == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(*value)
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateFirstNonEmpty(values ...string) string {
|
||||||
|
for _, value := range values {
|
||||||
|
value = strings.TrimSpace(value)
|
||||||
|
if value != "" {
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
|
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||||
|
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||||
|
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||||
|
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
switch vendor {
|
||||||
|
case "nvidia":
|
||||||
|
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||||
|
case "amd":
|
||||||
|
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||||
|
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||||
|
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||||
|
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||||
|
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
|
||||||
|
if strings.TrimSpace(headerActions) != "" {
|
||||||
|
actions += headerActions
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(`<div class="card"><div class="card-head card-head-actions"><span>%s</span><div class="card-head-buttons">%s</div></div><div class="card-body validate-card-body">%s</div></div>`,
|
||||||
|
label, actions, body)
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
41
audit/internal/webui/serial_console.go
Normal file
41
audit/internal/webui/serial_console.go
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var taskSerialWriteLine = writeTaskSerialLine
|
||||||
|
|
||||||
|
func writeTaskSerialLine(line string) {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
|
||||||
|
for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
|
||||||
|
f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, _ = f.WriteString(payload)
|
||||||
|
_ = f.Close()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskSerialPrefix(t *Task) string {
|
||||||
|
if t == nil {
|
||||||
|
return "[task] "
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskSerialEvent(t *Task, event string) {
|
||||||
|
if t == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
|
||||||
|
}
|
||||||
@@ -1,15 +1,19 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"html"
|
"html"
|
||||||
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"mime"
|
"mime"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"runtime/debug"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -18,7 +22,6 @@ import (
|
|||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
"bee/audit/internal/runtimeenv"
|
"bee/audit/internal/runtimeenv"
|
||||||
gocharts "github.com/go-analyze/charts"
|
|
||||||
"reanimator/chart/viewer"
|
"reanimator/chart/viewer"
|
||||||
"reanimator/chart/web"
|
"reanimator/chart/web"
|
||||||
)
|
)
|
||||||
@@ -132,6 +135,14 @@ type namedMetricsRing struct {
|
|||||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||||
const metricsChartWindow = 360
|
const metricsChartWindow = 360
|
||||||
|
|
||||||
|
// metricsDownsampleAge is the age after which old metrics rows are downsampled
|
||||||
|
// to 1 sample per minute. Data fresher than this is kept at full resolution.
|
||||||
|
const metricsDownsampleAge = 2 * time.Hour
|
||||||
|
|
||||||
|
// metricsRetainWindow is the total retention period for metrics rows.
|
||||||
|
// Rows older than this are deleted entirely by the background compactor.
|
||||||
|
const metricsRetainWindow = 48 * time.Hour
|
||||||
|
|
||||||
var metricsCollectInterval = 5 * time.Second
|
var metricsCollectInterval = 5 * time.Second
|
||||||
|
|
||||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||||
@@ -218,6 +229,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// ── Infrastructure ──────────────────────────────────────────────────────
|
// ── Infrastructure ──────────────────────────────────────────────────────
|
||||||
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
||||||
mux.HandleFunc("GET /api/ready", h.handleReady)
|
mux.HandleFunc("GET /api/ready", h.handleReady)
|
||||||
|
mux.HandleFunc("GET /loading", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(loadingPageHTML))
|
||||||
|
})
|
||||||
|
|
||||||
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
||||||
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
||||||
@@ -234,6 +250,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
|
|
||||||
// SAT
|
// SAT
|
||||||
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
|
||||||
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
||||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||||
@@ -247,6 +269,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
|
||||||
|
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
|
||||||
|
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||||
@@ -255,6 +282,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
||||||
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
||||||
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
||||||
|
mux.HandleFunc("GET /api/tasks/{id}/charts", h.handleAPITaskChartsIndex)
|
||||||
|
mux.HandleFunc("GET /api/tasks/{id}/chart/", h.handleAPITaskChartSVG)
|
||||||
|
mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
|
||||||
|
|
||||||
// Services
|
// Services
|
||||||
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
|
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
|
||||||
@@ -271,18 +301,18 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Export
|
// Export
|
||||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
|
||||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
|
||||||
|
mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)
|
||||||
|
|
||||||
// Tools
|
// Tools
|
||||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||||
|
|
||||||
// Display
|
|
||||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
|
||||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
|
||||||
|
|
||||||
// GPU presence / tools
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
|
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||||
|
mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
|
||||||
|
mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
|
||||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||||
|
|
||||||
// System
|
// System
|
||||||
@@ -309,22 +339,33 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("GET /", h.handlePage)
|
mux.HandleFunc("GET /", h.handlePage)
|
||||||
|
|
||||||
h.mux = mux
|
h.mux = mux
|
||||||
return mux
|
return recoverMiddleware(mux)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) startMetricsCollector() {
|
func (h *handler) startMetricsCollector() {
|
||||||
go func() {
|
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||||
ticker := time.NewTicker(metricsCollectInterval)
|
ticker := time.NewTicker(metricsCollectInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
pruneTicker := time.NewTicker(time.Hour)
|
||||||
sample := platform.SampleLiveMetrics()
|
defer pruneTicker.Stop()
|
||||||
if h.metricsDB != nil {
|
for {
|
||||||
_ = h.metricsDB.Write(sample)
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := platform.SampleLiveMetrics()
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
_ = h.metricsDB.Write(sample)
|
||||||
|
}
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
|
case <-pruneTicker.C:
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||||
|
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
h.feedRings(sample)
|
|
||||||
h.setLatestMetric(sample)
|
|
||||||
}
|
}
|
||||||
}()
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||||
@@ -345,7 +386,81 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
|||||||
|
|
||||||
// ListenAndServe starts the HTTP server.
|
// ListenAndServe starts the HTTP server.
|
||||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||||
return http.ListenAndServe(addr, NewHandler(opts))
|
srv := &http.Server{
|
||||||
|
Addr: addr,
|
||||||
|
Handler: NewHandler(opts),
|
||||||
|
ReadHeaderTimeout: 5 * time.Second,
|
||||||
|
ReadTimeout: 30 * time.Second,
|
||||||
|
IdleTimeout: 2 * time.Minute,
|
||||||
|
}
|
||||||
|
return srv.ListenAndServe()
|
||||||
|
}
|
||||||
|
|
||||||
|
type trackingResponseWriter struct {
|
||||||
|
http.ResponseWriter
|
||||||
|
wroteHeader bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) WriteHeader(statusCode int) {
|
||||||
|
w.wroteHeader = true
|
||||||
|
w.ResponseWriter.WriteHeader(statusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Write(p []byte) (int, error) {
|
||||||
|
w.wroteHeader = true
|
||||||
|
return w.ResponseWriter.Write(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Flush() {
|
||||||
|
w.wroteHeader = true
|
||||||
|
if f, ok := w.ResponseWriter.(http.Flusher); ok {
|
||||||
|
f.Flush()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
|
||||||
|
h, ok := w.ResponseWriter.(http.Hijacker)
|
||||||
|
if !ok {
|
||||||
|
return nil, nil, fmt.Errorf("hijacking not supported")
|
||||||
|
}
|
||||||
|
return h.Hijack()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
|
||||||
|
p, ok := w.ResponseWriter.(http.Pusher)
|
||||||
|
if !ok {
|
||||||
|
return http.ErrNotSupported
|
||||||
|
}
|
||||||
|
return p.Push(target, opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
|
||||||
|
rf, ok := w.ResponseWriter.(io.ReaderFrom)
|
||||||
|
if !ok {
|
||||||
|
return io.Copy(w.ResponseWriter, r)
|
||||||
|
}
|
||||||
|
w.wroteHeader = true
|
||||||
|
return rf.ReadFrom(r)
|
||||||
|
}
|
||||||
|
|
||||||
|
func recoverMiddleware(next http.Handler) http.Handler {
|
||||||
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
tw := &trackingResponseWriter{ResponseWriter: w}
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
slog.Error("http handler panic",
|
||||||
|
"method", r.Method,
|
||||||
|
"path", r.URL.Path,
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
if !tw.wroteHeader {
|
||||||
|
http.Error(tw, "internal server error", http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
next.ServeHTTP(tw, r)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
||||||
@@ -475,13 +590,60 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
|
samples, err := h.metricsDB.LoadAll()
|
||||||
|
if err != nil || len(samples) == 0 {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
timeline := metricsTimelineSegments(samples, time.Now())
|
||||||
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
|
var overviewOk bool
|
||||||
|
var buf []byte
|
||||||
|
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !overviewOk {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
var buf []byte
|
||||||
|
if stacked {
|
||||||
|
buf, err = renderStackedMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
buf, err = renderMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMin,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -491,20 +653,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
|
||||||
samples, err := h.metricsDB.LoadAll()
|
labels = sampleTimeLabels(samples)
|
||||||
if err != nil || len(samples) == 0 {
|
|
||||||
return nil, nil, nil, "", nil, nil, false
|
|
||||||
}
|
|
||||||
return chartDataFromSamples(path, samples)
|
|
||||||
}
|
|
||||||
|
|
||||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
|
||||||
var datasets [][]float64
|
|
||||||
var names []string
|
|
||||||
var title string
|
|
||||||
var yMin, yMax *float64
|
|
||||||
labels := sampleTimeLabels(samples)
|
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case path == "server-load":
|
case path == "server-load":
|
||||||
@@ -541,12 +691,19 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "System Power"
|
title = "System Power"
|
||||||
power := make([]float64, len(samples))
|
power := make([]float64, len(samples))
|
||||||
|
label := "Power W"
|
||||||
for i, s := range samples {
|
for i, s := range samples {
|
||||||
power[i] = s.PowerW
|
power[i] = s.PowerW
|
||||||
|
if strings.TrimSpace(s.PowerSource) != "" {
|
||||||
|
label = fmt.Sprintf("Power W · %s", s.PowerSource)
|
||||||
|
if strings.TrimSpace(s.PowerMode) != "" {
|
||||||
|
label += fmt.Sprintf(" (%s)", s.PowerMode)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
power = normalizePowerSeries(power)
|
power = normalizePowerSeries(power)
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{label}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(power)
|
yMax = autoMax120(power)
|
||||||
|
|
||||||
@@ -578,42 +735,66 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(datasets...)
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-clock":
|
||||||
|
title = "GPU Core Clock"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-memclock":
|
||||||
|
title = "GPU Memory Clock"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
case strings.HasPrefix(path, "gpu/"):
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
rest := strings.TrimPrefix(path, "gpu/")
|
idx, sub, ok := parseGPUChartPath(path)
|
||||||
sub := ""
|
if !ok {
|
||||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
sub = rest[i+1:]
|
|
||||||
rest = rest[:i]
|
|
||||||
}
|
}
|
||||||
idx := 0
|
|
||||||
fmt.Sscanf(rest, "%d", &idx)
|
|
||||||
switch sub {
|
switch sub {
|
||||||
case "load":
|
case "load":
|
||||||
title = fmt.Sprintf("GPU %d Load", idx)
|
title = gpuDisplayLabel(idx) + " Load"
|
||||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
if util == nil && mem == nil {
|
if util == nil && mem == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||||
names = []string{"Load %", "Mem %"}
|
names = []string{"Load %", "Mem %"}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = floatPtr(100)
|
yMax = floatPtr(100)
|
||||||
case "temp":
|
case "temp":
|
||||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
title = gpuDisplayLabel(idx) + " Temperature"
|
||||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
if temp == nil {
|
if temp == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{temp}
|
datasets = [][]float64{temp}
|
||||||
names = []string{"Temp °C"}
|
names = []string{"Temp °C"}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(temp)
|
yMax = autoMax120(temp)
|
||||||
|
case "clock":
|
||||||
|
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||||
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
|
if clock == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{clock}
|
||||||
|
names = []string{"Core Clock MHz"}
|
||||||
|
yMin, yMax = autoBounds120(clock)
|
||||||
|
case "memclock":
|
||||||
|
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||||
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
|
if clock == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{clock}
|
||||||
|
names = []string{"Memory Clock MHz"}
|
||||||
|
yMin, yMax = autoBounds120(clock)
|
||||||
default:
|
default:
|
||||||
title = fmt.Sprintf("GPU %d Power", idx)
|
title = gpuDisplayLabel(idx) + " Power"
|
||||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
if power == nil {
|
if power == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
@@ -621,10 +802,30 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
}
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
||||||
|
if !strings.HasPrefix(path, "gpu/") {
|
||||||
|
return 0, "", false
|
||||||
|
}
|
||||||
|
rest := strings.TrimPrefix(path, "gpu/")
|
||||||
|
if rest == "" {
|
||||||
|
return 0, "", false
|
||||||
|
}
|
||||||
|
sub = ""
|
||||||
|
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||||
|
sub = rest[i+1:]
|
||||||
|
rest = rest[:i]
|
||||||
|
}
|
||||||
|
n, err := fmt.Sscanf(rest, "%d", &idx)
|
||||||
|
if err != nil || n != 1 {
|
||||||
|
return 0, "", false
|
||||||
|
}
|
||||||
|
return idx, sub, true
|
||||||
}
|
}
|
||||||
|
|
||||||
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
||||||
@@ -719,7 +920,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
datasets = append(datasets, ds)
|
datasets = append(datasets, ds)
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
names = append(names, gpuDisplayLabel(idx))
|
||||||
}
|
}
|
||||||
return datasets, names
|
return datasets, names
|
||||||
}
|
}
|
||||||
@@ -770,6 +971,37 @@ func normalizePowerSeries(ds []float64) []float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
|
||||||
|
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]struct{}{}
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, p := range s.PSUs {
|
||||||
|
seen[p.Slot] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(seen))
|
||||||
|
for s := range seen {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
return slots
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
|
||||||
|
func psuStackedTotal(datasets [][]float64) []float64 {
|
||||||
|
if len(datasets) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
n := len(datasets[0])
|
||||||
|
total := make([]float64, n)
|
||||||
|
for _, ds := range datasets {
|
||||||
|
for i, v := range ds {
|
||||||
|
total[i] += v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
func normalizeFanSeries(ds []float64) []float64 {
|
func normalizeFanSeries(ds []float64) []float64 {
|
||||||
if len(ds) == 0 {
|
if len(ds) == 0 {
|
||||||
return nil
|
return nil
|
||||||
@@ -852,64 +1084,37 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
|
|||||||
return floatPtr(low), floatPtr(high)
|
return floatPtr(low), floatPtr(high)
|
||||||
}
|
}
|
||||||
|
|
||||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
func gpuChartLabelIndices(total, target int) []int {
|
||||||
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
if total <= 0 {
|
||||||
n := len(labels)
|
return nil
|
||||||
if n == 0 {
|
|
||||||
n = 1
|
|
||||||
labels = []string{""}
|
|
||||||
}
|
}
|
||||||
for i := range datasets {
|
if total == 1 {
|
||||||
if len(datasets[i]) == 0 {
|
return []int{0}
|
||||||
datasets[i] = make([]float64, n)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Append global min/avg/max to title.
|
step := total / target
|
||||||
mn, avg, mx := globalStats(datasets)
|
if step < 1 {
|
||||||
if mx > 0 {
|
step = 1
|
||||||
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
|
|
||||||
title,
|
|
||||||
chartLegendNumber(mn),
|
|
||||||
chartLegendNumber(avg),
|
|
||||||
chartLegendNumber(mx),
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
title = sanitizeChartText(title)
|
var indices []int
|
||||||
names = sanitizeChartTexts(names)
|
for i := 0; i < total; i += step {
|
||||||
sparse := sanitizeChartTexts(sparseLabels(labels, 6))
|
indices = append(indices, i)
|
||||||
|
}
|
||||||
|
if indices[len(indices)-1] != total-1 {
|
||||||
|
indices = append(indices, total-1)
|
||||||
|
}
|
||||||
|
return indices
|
||||||
|
}
|
||||||
|
|
||||||
opt := gocharts.NewLineChartOptionWithData(datasets)
|
func chartCanvasHeightForPath(path string, seriesCount int) int {
|
||||||
opt.Title = gocharts.TitleOption{Text: title}
|
height := chartCanvasHeight(seriesCount)
|
||||||
opt.XAxis.Labels = sparse
|
if isGPUChartPath(path) {
|
||||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
return height * 2
|
||||||
if chartLegendVisible(len(names)) {
|
|
||||||
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
|
|
||||||
opt.Legend.OverlayChart = gocharts.Ptr(false)
|
|
||||||
} else {
|
|
||||||
opt.Legend.Show = gocharts.Ptr(false)
|
|
||||||
}
|
|
||||||
opt.Symbol = gocharts.SymbolNone
|
|
||||||
// Right padding: reserve space for the MarkLine label (library recommendation).
|
|
||||||
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
|
||||||
if yMin != nil || yMax != nil {
|
|
||||||
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
|
|
||||||
}
|
}
|
||||||
|
return height
|
||||||
|
}
|
||||||
|
|
||||||
// Add a single peak mark line on the series that holds the global maximum.
|
func isGPUChartPath(path string) bool {
|
||||||
peakIdx, _ := globalPeakSeries(datasets)
|
return strings.HasPrefix(path, "gpu-all-") || strings.HasPrefix(path, "gpu/")
|
||||||
if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
|
|
||||||
opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
|
|
||||||
}
|
|
||||||
|
|
||||||
p := gocharts.NewPainter(gocharts.PainterOptions{
|
|
||||||
OutputFormat: gocharts.ChartOutputSVG,
|
|
||||||
Width: 1400,
|
|
||||||
Height: chartCanvasHeight(len(names)),
|
|
||||||
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
|
||||||
if err := p.LineChart(opt); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return p.Bytes()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartLegendVisible(seriesCount int) bool {
|
func chartLegendVisible(seriesCount int) bool {
|
||||||
@@ -923,30 +1128,6 @@ func chartCanvasHeight(seriesCount int) int {
|
|||||||
return 288
|
return 288
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
|
|
||||||
return gocharts.YAxisOption{
|
|
||||||
Min: yMin,
|
|
||||||
Max: yMax,
|
|
||||||
LabelCount: 11,
|
|
||||||
ValueFormatter: chartYAxisNumber,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// globalPeakSeries returns the index of the series containing the global maximum
|
|
||||||
// value across all datasets, and that maximum value.
|
|
||||||
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
|
||||||
idx = -1
|
|
||||||
for i, ds := range datasets {
|
|
||||||
for _, v := range ds {
|
|
||||||
if v > peak {
|
|
||||||
peak = v
|
|
||||||
idx = i
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return idx, peak
|
|
||||||
}
|
|
||||||
|
|
||||||
// globalStats returns min, average, and max across all values in all datasets.
|
// globalStats returns min, average, and max across all values in all datasets.
|
||||||
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
|
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
|
||||||
var sum float64
|
var sum float64
|
||||||
@@ -986,21 +1167,6 @@ func sanitizeChartText(s string) string {
|
|||||||
}, s))
|
}, s))
|
||||||
}
|
}
|
||||||
|
|
||||||
func sanitizeChartTexts(in []string) []string {
|
|
||||||
out := make([]string, len(in))
|
|
||||||
for i, s := range in {
|
|
||||||
out[i] = sanitizeChartText(s)
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
func safeIdx(s []float64, i int) float64 {
|
|
||||||
if i < len(s) {
|
|
||||||
return s[i]
|
|
||||||
}
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
|
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
|
||||||
var datasets [][]float64
|
var datasets [][]float64
|
||||||
var names []string
|
var names []string
|
||||||
@@ -1087,20 +1253,6 @@ func chartYAxisNumber(v float64) string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
func sparseLabels(labels []string, n int) []string {
|
|
||||||
out := make([]string, len(labels))
|
|
||||||
step := len(labels) / n
|
|
||||||
if step < 1 {
|
|
||||||
step = 1
|
|
||||||
}
|
|
||||||
for i, l := range labels {
|
|
||||||
if i%step == 0 {
|
|
||||||
out[i] = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
|
||||||
if h.metricsDB == nil {
|
if h.metricsDB == nil {
|
||||||
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
@@ -1116,6 +1268,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
|
|||||||
|
|
||||||
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
|
||||||
w.Header().Set("Cache-Control", "no-store")
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
if strings.TrimSpace(h.opts.AuditPath) == "" {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, _ = w.Write([]byte("ready"))
|
||||||
|
return
|
||||||
|
}
|
||||||
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
||||||
w.WriteHeader(http.StatusServiceUnavailable)
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
_, _ = w.Write([]byte("starting"))
|
_, _ = w.Write([]byte("starting"))
|
||||||
@@ -1129,37 +1286,106 @@ const loadingPageHTML = `<!DOCTYPE html>
|
|||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<title>EASY-BEE</title>
|
<title>EASY-BEE — Starting</title>
|
||||||
<style>
|
<style>
|
||||||
*{margin:0;padding:0;box-sizing:border-box}
|
*{margin:0;padding:0;box-sizing:border-box}
|
||||||
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||||
.logo{font-size:13px;line-height:1.4;color:#f6c90e;margin-bottom:48px;white-space:pre}
|
.wrap{text-align:center;width:420px}
|
||||||
.spinner{width:48px;height:48px;border:4px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 24px}
|
.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
|
||||||
|
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
|
||||||
|
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
||||||
|
.spinner.hidden{display:none}
|
||||||
@keyframes spin{to{transform:rotate(360deg)}}
|
@keyframes spin{to{transform:rotate(360deg)}}
|
||||||
.status{font-size:14px;color:#a0aec0;letter-spacing:.05em}
|
.status{font-size:13px;color:#a0aec0;margin-bottom:20px;min-height:18px}
|
||||||
|
table{width:100%;border-collapse:collapse;font-size:12px;margin-bottom:20px;display:none}
|
||||||
|
td{padding:3px 6px;text-align:left}
|
||||||
|
td:first-child{color:#718096;width:55%}
|
||||||
|
.ok{color:#68d391}
|
||||||
|
.run{color:#f6c90e}
|
||||||
|
.fail{color:#fc8181}
|
||||||
|
.dim{color:#4a5568}
|
||||||
|
.btn{background:#1a202c;color:#a0aec0;border:1px solid #2d3748;padding:7px 18px;font-size:12px;cursor:pointer;font-family:inherit;display:none}
|
||||||
|
.btn:hover{border-color:#718096;color:#e2e8f0}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div style="text-align:center">
|
<div class="wrap">
|
||||||
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
||||||
<div class="spinner"></div>
|
<div class="subtitle">Hardware Audit LiveCD</div>
|
||||||
<div class="status" id="s">Starting up...</div>
|
<div class="spinner" id="spin"></div>
|
||||||
|
<div class="status" id="st">Connecting to bee-web...</div>
|
||||||
|
<table id="tbl"></table>
|
||||||
|
<button class="btn" id="btn" onclick="go()">Open app now</button>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
function probe(){
|
(function(){
|
||||||
fetch('/api/ready',{cache:'no-store'})
|
var gone = false;
|
||||||
.then(function(r){
|
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
||||||
if(r.ok){window.location.replace('/');}
|
|
||||||
else{setTimeout(probe,1000);}
|
function icon(s){
|
||||||
|
if(s==='active') return '<span class="ok">● active</span>';
|
||||||
|
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
||||||
|
if(s==='activating'||s==='reloading') return '<span class="run">○ starting</span>';
|
||||||
|
if(s==='inactive') return '<span class="dim">○ inactive</span>';
|
||||||
|
return '<span class="dim">'+s+'</span>';
|
||||||
|
}
|
||||||
|
|
||||||
|
function allSettled(svcs){
|
||||||
|
for(var i=0;i<svcs.length;i++){
|
||||||
|
var s=svcs[i].state;
|
||||||
|
if(s!=='active'&&s!=='failed'&&s!=='inactive') return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var pollTimer=null;
|
||||||
|
|
||||||
|
function pollServices(){
|
||||||
|
fetch('/api/services',{cache:'no-store'})
|
||||||
|
.then(function(r){return r.json();})
|
||||||
|
.then(function(svcs){
|
||||||
|
if(!svcs||!svcs.length) return;
|
||||||
|
var tbl=document.getElementById('tbl');
|
||||||
|
tbl.style.display='';
|
||||||
|
var html='';
|
||||||
|
for(var i=0;i<svcs.length;i++)
|
||||||
|
html+='<tr><td>'+svcs[i].name+'</td><td>'+icon(svcs[i].state)+'</td></tr>';
|
||||||
|
tbl.innerHTML=html;
|
||||||
|
if(allSettled(svcs)){
|
||||||
|
clearInterval(pollTimer);
|
||||||
|
document.getElementById('spin').className='spinner hidden';
|
||||||
|
document.getElementById('st').textContent='Ready \u2014 opening...';
|
||||||
|
setTimeout(go,800);
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.catch(function(){setTimeout(probe,1000);});
|
.catch(function(){});
|
||||||
|
}
|
||||||
|
|
||||||
|
function probe(){
|
||||||
|
fetch('/healthz',{cache:'no-store'})
|
||||||
|
.then(function(r){
|
||||||
|
if(r.ok){
|
||||||
|
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
||||||
|
document.getElementById('btn').style.display='';
|
||||||
|
pollServices();
|
||||||
|
pollTimer=setInterval(pollServices,1500);
|
||||||
|
} else {
|
||||||
|
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
||||||
|
setTimeout(probe,500);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.catch(function(){
|
||||||
|
document.getElementById('st').textContent='Waiting for bee-web to start...';
|
||||||
|
setTimeout(probe,500);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
probe();
|
probe();
|
||||||
|
})();
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>`
|
</html>`
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -10,6 +11,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestChartLegendNumber(t *testing.T) {
|
func TestChartLegendNumber(t *testing.T) {
|
||||||
@@ -34,6 +36,59 @@ func TestChartLegendNumber(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
panic("boom")
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/panic", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusInternalServerError {
|
||||||
|
t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), "internal server error") {
|
||||||
|
t.Fatalf("body=%q", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if !sseStart(w) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !sseWrite(w, "tick", "ok") {
|
||||||
|
t.Fatal("expected sse write to succeed")
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/stream", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
|
||||||
|
t.Fatalf("content-type=%q", got)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
|
||||||
|
t.Fatalf("body=%q", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
|
||||||
|
row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
|
||||||
|
if row.Status != "WARNING" {
|
||||||
|
t.Fatalf("status=%q want WARNING", row.Status)
|
||||||
|
}
|
||||||
|
if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
|
||||||
|
t.Fatalf("issue=%q", row.Issue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
samples := []platform.LiveMetricSample{
|
samples := []platform.LiveMetricSample{
|
||||||
{
|
{
|
||||||
@@ -65,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -109,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -136,6 +191,39 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1400},
|
||||||
|
{GPUIndex: 3, ClockMHz: 1500},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1410},
|
||||||
|
{GPUIndex: 3, ClockMHz: 1510},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("gpu-all-clock returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Core Clock" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if got := datasets[1][1]; got != 1510 {
|
||||||
|
t.Fatalf("GPU 3 core clock=%v want 1510", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||||
want := []float64{0, 480, 480, 480, 510, 510}
|
want := []float64{0, 480, 480, 480, 510, 510}
|
||||||
@@ -157,6 +245,21 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
|||||||
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||||
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||||
}
|
}
|
||||||
|
if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
|
||||||
|
t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="gpu-chart-toggle"`) {
|
||||||
|
t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
|
||||||
|
t.Fatalf("metrics page should include GPU core clock chart: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
||||||
|
t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
|
||||||
|
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChartLegendVisible(t *testing.T) {
|
func TestChartLegendVisible(t *testing.T) {
|
||||||
@@ -199,6 +302,167 @@ func TestChartCanvasHeight(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
end := start.Add(10 * time.Minute)
|
||||||
|
taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
|
||||||
|
s := start.Add(offsetStart)
|
||||||
|
e := start.Add(offsetEnd)
|
||||||
|
return Task{
|
||||||
|
Name: "task",
|
||||||
|
Status: TaskDone,
|
||||||
|
StartedAt: &s,
|
||||||
|
DoneAt: &e,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
segments := chartTimelineSegmentsForRange(start, end, end, []Task{
|
||||||
|
taskWindow(1*time.Minute, 3*time.Minute),
|
||||||
|
taskWindow(2*time.Minute, 5*time.Minute),
|
||||||
|
taskWindow(7*time.Minute, 8*time.Minute),
|
||||||
|
})
|
||||||
|
if len(segments) != 5 {
|
||||||
|
t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
|
||||||
|
}
|
||||||
|
wantActive := []bool{false, true, false, true, false}
|
||||||
|
wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
|
||||||
|
for i, segment := range segments {
|
||||||
|
if segment.Active != wantActive[i] {
|
||||||
|
t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
|
||||||
|
}
|
||||||
|
if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
|
||||||
|
t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
|
||||||
|
}
|
||||||
|
if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
|
||||||
|
t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
labels := []string{"12:00", "12:01", "12:02"}
|
||||||
|
times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
|
||||||
|
svg, err := renderMetricChartSVG(
|
||||||
|
"System Power",
|
||||||
|
labels,
|
||||||
|
times,
|
||||||
|
[][]float64{{300, 320, 310}},
|
||||||
|
[]string{"Power W"},
|
||||||
|
floatPtr(0),
|
||||||
|
floatPtr(400),
|
||||||
|
360,
|
||||||
|
[]chartTimelineSegment{
|
||||||
|
{Start: start, End: start.Add(time.Minute), Active: false},
|
||||||
|
{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
body := string(svg)
|
||||||
|
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||||
|
t.Fatalf("svg missing timeline overlay: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `opacity="0.10"`) {
|
||||||
|
t.Fatalf("svg missing idle overlay opacity: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `System Power`) {
|
||||||
|
t.Fatalf("svg missing chart title: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { _ = db.db.Close() })
|
||||||
|
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
for i, sample := range []platform.LiveMetricSample{
|
||||||
|
{Timestamp: start, PowerW: 300},
|
||||||
|
{Timestamp: start.Add(time.Minute), PowerW: 320},
|
||||||
|
{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
|
||||||
|
} {
|
||||||
|
if err := db.Write(sample); err != nil {
|
||||||
|
t.Fatalf("write sample %d: %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
prevTasks := globalQueue.tasks
|
||||||
|
s := start.Add(30 * time.Second)
|
||||||
|
e := start.Add(90 * time.Second)
|
||||||
|
globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = prevTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
|
||||||
|
h.handleMetricsChartSVG(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||||
|
t.Fatalf("custom svg response missing timeline overlay: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `stroke-linecap="round"`) {
|
||||||
|
t.Fatalf("custom svg response missing custom polyline styling: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: start,
|
||||||
|
PSUs: []platform.PSUReading{
|
||||||
|
{Slot: 1, PowerW: 120},
|
||||||
|
{Slot: 2, PowerW: 130},
|
||||||
|
},
|
||||||
|
PowerW: 250,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: start.Add(time.Minute),
|
||||||
|
PSUs: []platform.PSUReading{
|
||||||
|
{Slot: 1, PowerW: 140},
|
||||||
|
{Slot: 2, PowerW: 135},
|
||||||
|
},
|
||||||
|
PowerW: 275,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected server-power chart data")
|
||||||
|
}
|
||||||
|
if title != "System Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if stacked {
|
||||||
|
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
|
||||||
|
}
|
||||||
|
if len(datasets) != 1 || len(names) != 1 {
|
||||||
|
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
|
||||||
|
}
|
||||||
|
if names[0] != "Power W · sdr_psu_input (autotuned)" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
@@ -212,21 +476,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChartYAxisOption(t *testing.T) {
|
|
||||||
min := floatPtr(0)
|
|
||||||
max := floatPtr(100)
|
|
||||||
opt := chartYAxisOption(min, max)
|
|
||||||
if opt.Min != min || opt.Max != max {
|
|
||||||
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
|
||||||
}
|
|
||||||
if opt.LabelCount != 11 {
|
|
||||||
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
|
||||||
}
|
|
||||||
if got := opt.ValueFormatter(1000); got != "1к" {
|
|
||||||
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||||
r1 := newMetricsRing(4)
|
r1 := newMetricsRing(4)
|
||||||
r2 := newMetricsRing(4)
|
r2 := newMetricsRing(4)
|
||||||
@@ -335,7 +584,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
if !strings.Contains(body, `Run Audit`) {
|
if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
|
||||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||||
}
|
}
|
||||||
if strings.Contains(body, `No audit data`) {
|
if strings.Contains(body, `No audit data`) {
|
||||||
@@ -343,6 +592,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(rec.Body.String()) != "ready" {
|
||||||
|
t.Fatalf("body=%q want ready", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -365,7 +626,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||||
@@ -373,8 +634,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
if !strings.Contains(body, `id="task-log-overlay"`) {
|
if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
|
||||||
t.Fatalf("tasks page missing log modal overlay: %s", body)
|
t.Fatalf("tasks page missing task report hint: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||||
@@ -384,7 +645,7 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||||
@@ -392,54 +653,332 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||||
|
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||||
|
}
|
||||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||||
|
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||||
|
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Export to USB`) {
|
if !strings.Contains(body, `USB Black-Box`) {
|
||||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
t.Fatalf("tools page missing usb black-box section: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTasksPageRendersScrollableLogModal(t *testing.T) {
|
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||||
dir := t.TempDir()
|
handler := NewHandler(HandlerOptions{})
|
||||||
path := filepath.Join(dir, "audit.json")
|
|
||||||
exportDir := filepath.Join(dir, "export")
|
|
||||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
handler := NewHandler(HandlerOptions{
|
|
||||||
Title: "Bee Hardware Audit",
|
|
||||||
AuditPath: path,
|
|
||||||
ExportDir: exportDir,
|
|
||||||
})
|
|
||||||
|
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
if rec.Code != http.StatusOK {
|
if rec.Code != http.StatusOK {
|
||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
if !strings.Contains(body, `height:calc(100vh - 32px)`) {
|
for _, needle := range []string{
|
||||||
t.Fatalf("tasks page missing bounded log modal height: %s", body)
|
`href="/benchmark"`,
|
||||||
|
`id="benchmark-gpu-list"`,
|
||||||
|
`/api/gpu/nvidia`,
|
||||||
|
`/api/bee-bench/nvidia/perf/run`,
|
||||||
|
`/api/bee-bench/nvidia/power/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/status`,
|
||||||
|
`benchmark-run-nccl`,
|
||||||
|
`Run Performance Benchmark`,
|
||||||
|
`Run Power / Thermal Fit`,
|
||||||
|
`Autotune`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
|
}
|
||||||
t.Fatalf("tasks page missing log modal overflow guard: %s", body)
|
|
||||||
|
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
|
result := platform.NvidiaBenchmarkResult{
|
||||||
t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Index: 1,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1168.50,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Perf Results`,
|
||||||
|
`Composite score by saved benchmark run and GPU.`,
|
||||||
|
`GPU 0`,
|
||||||
|
`GPU 1`,
|
||||||
|
`#1`,
|
||||||
|
wantTime,
|
||||||
|
`1176.25`,
|
||||||
|
`1168.50`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA GPU Targeted Stress`,
|
||||||
|
`nvidia-targeted-stress`,
|
||||||
|
`controlled NVIDIA DCGM load`,
|
||||||
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
`NVIDIA GPU Selection`,
|
||||||
|
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
||||||
|
`Select All`,
|
||||||
|
`id="sat-gpu-list"`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA Interconnect (NCCL)`,
|
||||||
|
`Validate and Stress:`,
|
||||||
|
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||||
|
`nvbandwidth runs all built-in tests without a time limit`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA Max Compute Load`,
|
||||||
|
`dcgmproftester`,
|
||||||
|
`NCCL`,
|
||||||
|
`Validate → Stress mode`,
|
||||||
|
`id="burn-gpu-list"`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("burn page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDetailPageRendersSavedReport(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
|
||||||
|
if err := os.MkdirAll(reportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
reportPath := filepath.Join(reportDir, "report.html")
|
||||||
|
if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
ArtifactsDir: reportDir,
|
||||||
|
ReportHTMLPath: reportPath,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `saved report`) {
|
||||||
|
t.Fatalf("task detail page missing saved report: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Back to Tasks`) {
|
||||||
|
t.Fatalf("task detail page missing back link: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-live-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `Cancel</button>`) {
|
||||||
|
t.Fatalf("task detail page missing cancel button: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `function cancelTaskDetail(id)`) {
|
||||||
|
t.Fatalf("task detail page missing cancel handler: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tasks/' + id + '/cancel`) {
|
||||||
|
t.Fatalf("task detail page missing cancel endpoint: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="task-live-charts"`) {
|
||||||
|
t.Fatalf("task detail page missing live charts container: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tasks/' + taskId + '/charts`) {
|
||||||
|
t.Fatalf("task detail page missing live charts index endpoint: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
db, err := openMetricsDB(metricsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
base := time.Now().UTC()
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
|
||||||
|
{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
|
||||||
|
{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
|
||||||
|
}
|
||||||
|
for _, sample := range samples {
|
||||||
|
if err := db.Write(sample); err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = db.Close()
|
||||||
|
|
||||||
|
started := base.Add(-2*time.Minute - 5*time.Second)
|
||||||
|
done := base.Add(-1*time.Minute + 5*time.Second)
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-chart-1",
|
||||||
|
Name: "Power Window",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: started.Add(-10 * time.Second),
|
||||||
|
StartedAt: &started,
|
||||||
|
DoneAt: &done,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
|
||||||
|
req.SetPathValue("id", "task-chart-1")
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "System Power") {
|
||||||
|
t.Fatalf("task chart missing expected title: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "min 200") {
|
||||||
|
t.Fatalf("task chart stats should start from in-window sample: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, "min 100") {
|
||||||
|
t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -564,3 +1103,101 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
|
|||||||
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
health := `{
|
||||||
|
"status":"PARTIAL",
|
||||||
|
"checked_at":"2026-03-16T10:00:00Z",
|
||||||
|
"export_dir":"/tmp/export",
|
||||||
|
"driver_ready":true,
|
||||||
|
"cuda_ready":false,
|
||||||
|
"network_status":"PARTIAL",
|
||||||
|
"issues":[
|
||||||
|
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
|
||||||
|
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
|
||||||
|
],
|
||||||
|
"tools":[
|
||||||
|
{"name":"dmidecode","ok":true},
|
||||||
|
{"name":"nvidia-smi","ok":false}
|
||||||
|
],
|
||||||
|
"services":[
|
||||||
|
{"name":"bee-web","status":"active"},
|
||||||
|
{"name":"bee-nvidia","status":"inactive"}
|
||||||
|
]
|
||||||
|
}`
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
componentStatus := `[
|
||||||
|
{
|
||||||
|
"component_key":"cpu:all",
|
||||||
|
"status":"Warning",
|
||||||
|
"error_summary":"cpu SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"memory:all",
|
||||||
|
"status":"OK",
|
||||||
|
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"storage:nvme0n1",
|
||||||
|
"status":"Critical",
|
||||||
|
"error_summary":"storage SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"pcie:gpu:nvidia",
|
||||||
|
"status":"Warning",
|
||||||
|
"error_summary":"nvidia SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
|
||||||
|
}
|
||||||
|
]`
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
// Runtime Health card — LiveCD checks only
|
||||||
|
`Runtime Health`,
|
||||||
|
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||||
|
`Export Directory`,
|
||||||
|
`Network`,
|
||||||
|
`NVIDIA/AMD Driver`,
|
||||||
|
`CUDA / ROCm`,
|
||||||
|
`Required Utilities`,
|
||||||
|
`Bee Services`,
|
||||||
|
`CUDA runtime is not ready for GPU SAT.`,
|
||||||
|
`Missing: nvidia-smi`,
|
||||||
|
`bee-nvidia=inactive`,
|
||||||
|
// Hardware Summary card — component health badges
|
||||||
|
`Hardware Summary`,
|
||||||
|
`>CPU<`,
|
||||||
|
`>Memory<`,
|
||||||
|
`>Storage<`,
|
||||||
|
`>GPU<`,
|
||||||
|
`>PSU<`,
|
||||||
|
`badge-warn`, // cpu Warning badge
|
||||||
|
`badge-err`, // storage Critical badge
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
71
audit/internal/webui/stability.go
Normal file
71
audit/internal/webui/stability.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"runtime/debug"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Backoff bounds for goRecoverLoop: the restart delay is capped at
// recoverLoopMaxDelay, and resets to its initial value once a run
// survives for at least recoverLoopResetAfter without panicking.
const (
	recoverLoopMaxDelay   = 60 * time.Second
	recoverLoopResetAfter = 30 * time.Second
)
|
||||||
|
|
||||||
|
// goRecoverLoop starts fn in a goroutine, restarting after panics.
|
||||||
|
// restartDelay is the initial delay; successive panics double it up to
|
||||||
|
// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
|
||||||
|
// successfully for recoverLoopResetAfter without panicking.
|
||||||
|
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||||
|
go func() {
|
||||||
|
delay := restartDelay
|
||||||
|
consecutive := 0
|
||||||
|
for {
|
||||||
|
start := time.Now()
|
||||||
|
panicked := runRecoverable(name, fn)
|
||||||
|
if !panicked {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
consecutive++
|
||||||
|
if time.Since(start) >= recoverLoopResetAfter {
|
||||||
|
delay = restartDelay
|
||||||
|
consecutive = 1
|
||||||
|
}
|
||||||
|
slog.Warn("goroutine restarting after panic",
|
||||||
|
"component", name,
|
||||||
|
"consecutive_panics", consecutive,
|
||||||
|
"next_delay", delay,
|
||||||
|
)
|
||||||
|
if delay > 0 {
|
||||||
|
time.Sleep(delay)
|
||||||
|
}
|
||||||
|
if delay < recoverLoopMaxDelay {
|
||||||
|
delay *= 2
|
||||||
|
if delay > recoverLoopMaxDelay {
|
||||||
|
delay = recoverLoopMaxDelay
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func goRecoverOnce(name string, fn func()) {
|
||||||
|
go func() {
|
||||||
|
_ = runRecoverable(name, fn)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// runRecoverable invokes fn, converting a panic into a structured error
// log (with the stack trace). It reports whether fn panicked.
func runRecoverable(name string, fn func()) (panicked bool) {
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		panicked = true
		slog.Error("recovered panic",
			"component", name,
			"panic", fmt.Sprint(r),
			"stack", string(debug.Stack()),
		)
	}()
	fn()
	return false
}
|
||||||
267
audit/internal/webui/task_page.go
Normal file
267
audit/internal/webui/task_page.go
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
	"encoding/json"
	"fmt"
	"html"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	"bee/audit/internal/platform"
)
|
||||||
|
|
||||||
|
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
task, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
snapshot := *task
|
||||||
|
body := renderTaskDetailPage(h.opts, snapshot)
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
|
||||||
|
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
type taskChartIndexEntry struct {
|
||||||
|
Title string `json:"title"`
|
||||||
|
File string `json:"file"`
|
||||||
|
}
|
||||||
|
entries := make([]taskChartIndexEntry, 0)
|
||||||
|
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||||
|
title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
|
||||||
|
}
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
_ = json.NewEncoder(w).Encode(entries)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
|
||||||
|
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
|
||||||
|
path, ok := taskChartPathFromFile(file)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
|
||||||
|
if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTaskDetailPage builds the complete HTML document for a task's
// detail page: a toolbar, the saved report fragment (or a live summary
// card), and — for running/pending tasks — live chart and log sections
// driven by the embedded client-side script (SSE log stream plus 2s
// chart polling).
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
	// Fall back to the task ID when no human-readable name is set.
	title := task.Name
	if strings.TrimSpace(title) == "" {
		title = task.ID
	}
	var body strings.Builder
	// Toolbar: back link, optional Cancel button, artifacts hint.
	body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
	body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
	if task.Status == TaskRunning || task.Status == TaskPending {
		body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
	}
	body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
	body.WriteString(`</div>`)

	// Prefer the persisted report fragment; otherwise render a minimal
	// inline summary card from the live task fields.
	if report := loadTaskReportFragment(task); report != "" {
		body.WriteString(report)
	} else {
		body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
		body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
		body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
		if strings.TrimSpace(task.ErrMsg) != "" {
			body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
		}
		body.WriteString(`</div></div>`)
	}

	// Live charts are only shown while the task is actually running.
	if task.Status == TaskRunning {
		body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
		body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
		body.WriteString(`</div></div>`)
	}

	// Live log terminal plus the page script: cancel handler, chart
	// index fetch/refresh, and the EventSource log stream that freezes
	// charts and closes itself once the task reports 'done'.
	if task.Status == TaskRunning || task.Status == TaskPending {
		body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
		body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
		body.WriteString(`</div></div>`)
		body.WriteString(`<script>
function cancelTaskDetail(id) {
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
var term = document.getElementById('task-live-log');
if (term) {
term.textContent += '\nCancel requested.\n';
term.scrollTop = term.scrollHeight;
}
});
}
function renderTaskLiveCharts(taskId, charts) {
const host = document.getElementById('task-live-charts');
if (!host) return;
if (!Array.isArray(charts) || charts.length === 0) {
host.innerHTML = 'Waiting for metric samples...';
return;
}
const seen = {};
charts.forEach(function(chart) {
seen[chart.file] = true;
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
if (img) {
const card = img.closest('.card');
if (card) {
const title = card.querySelector('.card-head');
if (title) title.textContent = chart.title;
}
return;
}
const card = document.createElement('div');
card.className = 'card';
card.style.margin = '0';
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
card.querySelector('.card-head').textContent = chart.title;
const body = card.querySelector('.card-body');
img = document.createElement('img');
img.setAttribute('data-task-chart', '1');
img.setAttribute('data-chart-file', chart.file);
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
img.style.width = '100%';
img.style.display = 'block';
img.style.borderRadius = '6px';
img.alt = chart.title;
body.appendChild(img);
host.appendChild(card);
});
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
const file = img.getAttribute('data-chart-file') || '';
if (seen[file]) return;
const card = img.closest('.card');
if (card) card.remove();
});
}
function loadTaskLiveCharts(taskId) {
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
renderTaskLiveCharts(taskId, charts);
}).catch(function(){
const host = document.getElementById('task-live-charts');
if (host) host.innerHTML = 'Task charts are unavailable.';
});
}
function refreshTaskLiveCharts() {
document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
const base = img.dataset.baseSrc;
if (!base) return;
img.src = base + '?t=' + Date.now();
});
}
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
var _taskDetailTerm = document.getElementById('task-live-log');
var _taskChartTimer = null;
var _taskChartsFrozen = false;
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
_taskDetailES.addEventListener('done', function(e){
if (_taskChartTimer) clearInterval(_taskChartTimer);
_taskDetailES.close();
_taskDetailES = null;
_taskChartsFrozen = true;
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
refreshTaskLiveCharts();
});
_taskDetailES.onerror = function(){
if (_taskChartTimer) clearInterval(_taskChartTimer);
if (_taskDetailES) {
_taskDetailES.close();
_taskDetailES = null;
}
};
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
_taskChartTimer = setInterval(function(){
if (_taskChartsFrozen) return;
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
refreshTaskLiveCharts();
}, 2000);
</script>`)
	}

	// Assemble the final document around the shared layout chrome.
	return layoutHead(opts.Title+" — "+title) +
		layoutNav("tasks", opts.BuildLabel) +
		`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
		body.String() +
		`</div></div></body></html>`
}
|
||||||
|
|
||||||
|
func loadTaskReportFragment(task Task) string {
|
||||||
|
if strings.TrimSpace(task.ReportHTMLPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return string(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskArtifactDownloadLink(task Task, absPath string) string {
|
||||||
|
if strings.TrimSpace(absPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
taskPtr, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
return Task{}, nil, time.Time{}, time.Time{}, false
|
||||||
|
}
|
||||||
|
task := *taskPtr
|
||||||
|
start, end := taskTimeWindow(&task)
|
||||||
|
samples, err := loadTaskMetricSamples(start, end)
|
||||||
|
if err != nil {
|
||||||
|
return task, nil, start, end, true
|
||||||
|
}
|
||||||
|
return task, samples, start, end, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskTimelineForTask(task Task) []chartTimelineSegment {
|
||||||
|
start, end := taskTimeWindow(&task)
|
||||||
|
return []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskChartPathFromFile(file string) (string, bool) {
|
||||||
|
file = strings.TrimSpace(file)
|
||||||
|
for _, spec := range taskDashboardChartSpecs {
|
||||||
|
if spec.File == file {
|
||||||
|
return spec.Path, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(file, "gpu-") && strings.HasSuffix(file, "-overview.svg") {
|
||||||
|
id := strings.TrimSuffix(strings.TrimPrefix(file, "gpu-"), "-overview.svg")
|
||||||
|
return "gpu/" + id + "-overview", true
|
||||||
|
}
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
371
audit/internal/webui/task_report.go
Normal file
371
audit/internal/webui/task_report.go
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// taskReportMetricsDBPath is the metrics database location used when
// building task reports; kept as a variable (not a const), presumably
// so tests can redirect it — confirm.
var taskReportMetricsDBPath = metricsDBPath
|
||||||
|
|
||||||
|
// taskReport is the JSON document persisted as a task's report.json
// artifact and rendered into the report HTML fragment.
type taskReport struct {
	ID          string            `json:"id"`
	Name        string            `json:"name"`
	Target      string            `json:"target"`
	Status      string            `json:"status"`
	CreatedAt   time.Time         `json:"created_at"`
	StartedAt   *time.Time        `json:"started_at,omitempty"`
	DoneAt      *time.Time        `json:"done_at,omitempty"`
	DurationSec int               `json:"duration_sec,omitempty"`
	Error       string            `json:"error,omitempty"`
	LogFile     string            `json:"log_file,omitempty"`
	Charts      []taskReportChart `json:"charts,omitempty"`
	GeneratedAt time.Time         `json:"generated_at"`
}
|
||||||
|
|
||||||
|
// taskReportChart describes one chart saved with a task report: its
// display title and the SVG file name in the artifacts directory.
type taskReportChart struct {
	Title string `json:"title"`
	File  string `json:"file"`
}
|
||||||
|
|
||||||
|
// taskChartSpec pairs an internal chart path (as understood by the
// chart renderer) with the output SVG file name.
type taskChartSpec struct {
	Path string
	File string
}
|
||||||
|
|
||||||
|
// taskDashboardChartSpecs lists the fixed server- and aggregate-GPU
// charts rendered for every task; per-GPU overview charts are appended
// dynamically by taskChartSpecsForSamples.
var taskDashboardChartSpecs = []taskChartSpec{
	{Path: "server-load", File: "server-load.svg"},
	{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
	{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
	{Path: "server-power", File: "server-power.svg"},
	{Path: "server-fans", File: "server-fans.svg"},
	{Path: "gpu-all-load", File: "gpu-all-load.svg"},
	{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
	{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
	{Path: "gpu-all-power", File: "gpu-all-power.svg"},
	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
}
|
||||||
|
|
||||||
|
func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
|
||||||
|
specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(taskGPUIndices(samples)))
|
||||||
|
specs = append(specs, taskDashboardChartSpecs...)
|
||||||
|
for _, idx := range taskGPUIndices(samples) {
|
||||||
|
specs = append(specs, taskChartSpec{
|
||||||
|
Path: fmt.Sprintf("gpu/%d-overview", idx),
|
||||||
|
File: fmt.Sprintf("gpu-%d-overview.svg", idx),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return specs
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeTaskReportArtifacts renders and persists the task's report
// artifacts — chart SVGs, report JSON, and the report HTML fragment —
// into the task's artifacts directory. A nil task or one without an
// artifacts directory is a no-op.
func writeTaskReportArtifacts(t *Task) error {
	if t == nil {
		return nil
	}
	ensureTaskReportPaths(t)
	if strings.TrimSpace(t.ArtifactsDir) == "" {
		return nil
	}
	if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
		return err
	}

	// Chart rendering is best effort: a sample-load failure simply
	// produces a report without charts (error deliberately ignored).
	start, end := taskTimeWindow(t)
	samples, _ := loadTaskMetricSamples(start, end)
	charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)

	// The full task log is embedded in the HTML report; a missing log
	// file just yields an empty log section.
	logText := ""
	if data, err := os.ReadFile(t.LogPath); err == nil {
		logText = string(data)
	}

	report := taskReport{
		ID:          t.ID,
		Name:        t.Name,
		Target:      t.Target,
		Status:      t.Status,
		CreatedAt:   t.CreatedAt,
		StartedAt:   t.StartedAt,
		DoneAt:      t.DoneAt,
		DurationSec: taskElapsedSec(t, reportDoneTime(t)),
		Error:       t.ErrMsg,
		LogFile:     filepath.Base(t.LogPath),
		Charts:      charts,
		GeneratedAt: time.Now().UTC(),
	}
	// JSON first; the HTML fragment is derived from the same report.
	if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
		return err
	}
	return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
}
|
||||||
|
|
||||||
|
func reportDoneTime(t *Task) time.Time {
|
||||||
|
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
return *t.DoneAt
|
||||||
|
}
|
||||||
|
return time.Now()
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskTimeWindow(t *Task) (time.Time, time.Time) {
|
||||||
|
if t == nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
return now, now
|
||||||
|
}
|
||||||
|
start := t.CreatedAt.UTC()
|
||||||
|
if t.StartedAt != nil && !t.StartedAt.IsZero() {
|
||||||
|
start = t.StartedAt.UTC()
|
||||||
|
}
|
||||||
|
end := time.Now().UTC()
|
||||||
|
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
end = t.DoneAt.UTC()
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
end = start
|
||||||
|
}
|
||||||
|
return start, end
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadTaskMetricSamples loads the live metric samples recorded between
// start and end from the metrics database at taskReportMetricsDBPath.
// The database handle is opened per call and closed before returning.
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
	db, err := openMetricsDB(taskReportMetricsDBPath)
	if err != nil {
		return nil, err
	}
	defer db.Close()
	return db.LoadBetween(start, end)
}
|
||||||
|
|
||||||
|
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
|
var charts []taskReportChart
|
||||||
|
inline := make(map[string]string)
|
||||||
|
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||||
|
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||||
|
if !ok || len(svg) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := filepath.Join(dir, spec.File)
|
||||||
|
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||||
|
inline[spec.File] = string(svg)
|
||||||
|
}
|
||||||
|
return charts, inline
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTaskChartSVG renders the chart identified by path from samples,
// returning (title, svg, ok). ok is false when the chart has no data
// for these samples or rendering fails.
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
	// Per-GPU overview charts ("gpu/<idx>-overview") go through a
	// dedicated renderer rather than the generic metric-chart path.
	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
		buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
		if err != nil || !hasData {
			return "", nil, false
		}
		return gpuDisplayLabel(idx) + " Overview", buf, true
	}
	// Generic path: extract the datasets for this chart and pick the
	// stacked or line renderer accordingly.
	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
	if !ok {
		return "", nil, false
	}
	var buf []byte
	var err error
	if stacked {
		buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
	} else {
		buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
	}
	if err != nil {
		return "", nil, false
	}
	return title, buf, true
}
|
||||||
|
|
||||||
|
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]bool{}
|
||||||
|
var out []int
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if seen[g.GPUIndex] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[g.GPUIndex] = true
|
||||||
|
out = append(out, g.GPUIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Ints(out)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeJSONFile(path string, v any) error {
|
||||||
|
data, err := json.MarshalIndent(v, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(path, data, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTaskReportFragment builds the HTML fragment for a finished
// task: a summary card, optional benchmark/power result cards, one card
// per chart (using the inline SVG markup in charts, keyed by file
// name), and the full task log.
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
	var b strings.Builder
	// Summary card: name/target on the left, status (and error) on the right.
	b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
	b.WriteString(`<div class="grid2">`)
	b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
	b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
	b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
	if strings.TrimSpace(report.Error) != "" {
		b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
	}
	b.WriteString(`</div></div>`)
	// Timing footer of the summary card.
	b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
	b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
	b.WriteString(`</div></div></div>`)
	// Target-specific result cards (each renders "" when not applicable).
	if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
		b.WriteString(benchmarkCard)
	}
	if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
		b.WriteString(powerCard)
	}

	// One card per chart, embedding the pre-rendered inline SVG.
	if len(report.Charts) > 0 {
		for _, chart := range report.Charts {
			b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
			b.WriteString(charts[chart.File])
			b.WriteString(`</div></div>`)
		}
	} else {
		b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
	}

	// Full task log, HTML-escaped, in a terminal-styled block.
	b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
	b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
	b.WriteString(`</div></div>`)
	return b.String()
}
|
||||||
|
|
||||||
|
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||||
|
switch strings.TrimSpace(target) {
|
||||||
|
case "nvidia-bench-perf":
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Perf Results",
|
||||||
|
"Composite score for this benchmark task.",
|
||||||
|
"No benchmark results were saved for this task.",
|
||||||
|
columns,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTaskPowerResultsCard(target, logText string) string {
|
||||||
|
if strings.TrimSpace(target) != "nvidia-bench-power" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(resultPath)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var result platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
|
||||||
|
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
b.WriteString(`</table></div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskBenchmarkResultPath(logText string) string {
|
||||||
|
archivePath := taskArchivePathFromLog(logText)
|
||||||
|
if archivePath == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
return filepath.Join(runDir, "result.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
// taskArchivePathFromLog scans the task log bottom-up for the most
// recent "Archive: <path>.tar.gz" line and returns the path, or "".
func taskArchivePathFromLog(logText string) string {
	lines := strings.Split(logText, "\n")
	for i := len(lines); i > 0; i-- {
		rest, found := strings.CutPrefix(strings.TrimSpace(lines[i-1]), "Archive:")
		if !found {
			continue
		}
		path := strings.TrimSpace(rest)
		// Some log variants read "Archive: Archive written to <path>".
		if after, ok := strings.CutPrefix(path, "Archive written to "); ok {
			path = strings.TrimSpace(after)
		}
		if strings.HasSuffix(path, ".tar.gz") {
			return path
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
func renderTaskStatusBadge(status string) string {
|
||||||
|
className := map[string]string{
|
||||||
|
TaskRunning: "badge-ok",
|
||||||
|
TaskPending: "badge-unknown",
|
||||||
|
TaskDone: "badge-ok",
|
||||||
|
TaskFailed: "badge-err",
|
||||||
|
TaskCancelled: "badge-unknown",
|
||||||
|
}[status]
|
||||||
|
if className == "" {
|
||||||
|
className = "badge-unknown"
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(status)
|
||||||
|
if label == "" {
|
||||||
|
label = "unknown"
|
||||||
|
}
|
||||||
|
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatTaskTime(ts *time.Time, fallback time.Time) string {
|
||||||
|
if ts != nil && !ts.IsZero() {
|
||||||
|
return ts.Local().Format("2006-01-02 15:04:05")
|
||||||
|
}
|
||||||
|
if !fallback.IsZero() {
|
||||||
|
return fallback.Local().Format("2006-01-02 15:04:05")
|
||||||
|
}
|
||||||
|
return "n/a"
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatTaskDuration renders sec as "Ns", "Nm SSs", or "Nh MMm SSs".
// Non-positive values yield "n/a".
func formatTaskDuration(sec int) string {
	switch {
	case sec <= 0:
		return "n/a"
	case sec < 60:
		return fmt.Sprintf("%ds", sec)
	case sec < 3600:
		return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
	default:
		return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
	}
}
|
||||||
505
audit/internal/webui/task_runner.go
Normal file
505
audit/internal/webui/task_runner.go
Normal file
@@ -0,0 +1,505 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/runtimeenv"
|
||||||
|
)
|
||||||
|
|
||||||
|
// taskRunnerState is the small JSON document persisted alongside a
// task's artifacts so a restarted bee-web process can rediscover the
// runner process and its last known status.
type taskRunnerState struct {
	PID       int       `json:"pid"`
	Status    string    `json:"status"`
	Error     string    `json:"error,omitempty"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||||
|
|
||||||
|
func taskRunnerStatePath(t *Task) string {
|
||||||
|
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return filepath.Join(t.ArtifactsDir, "runner-state.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
|
||||||
|
path := taskRunnerStatePath(t)
|
||||||
|
if path == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(state, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
|
||||||
|
path := taskRunnerStatePath(t)
|
||||||
|
if path == "" {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
var state taskRunnerState
|
||||||
|
if err := json.Unmarshal(data, &state); err != nil {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
return state, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// processAlive reports whether a process with the given PID currently
// exists. Sending signal 0 performs the existence check without delivering
// anything; EPERM means the process exists but is owned by another user,
// which still counts as alive.
func processAlive(pid int) bool {
	if pid <= 0 {
		return false
	}
	switch killErr := syscall.Kill(pid, 0); killErr {
	case nil, syscall.EPERM:
		return true
	default:
		return false
	}
}
|
||||||
|
|
||||||
|
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
|
||||||
|
now := time.Now()
|
||||||
|
t.DoneAt = &now
|
||||||
|
switch {
|
||||||
|
case cancelled:
|
||||||
|
t.Status = TaskCancelled
|
||||||
|
t.ErrMsg = "aborted"
|
||||||
|
case strings.TrimSpace(errMsg) != "":
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = errMsg
|
||||||
|
default:
|
||||||
|
t.Status = TaskDone
|
||||||
|
t.ErrMsg = ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// executeTaskWithOptions runs a single task to completion in-process.
// It dispatches on t.Target to the matching acceptance/benchmark/stress
// pack, streams human-readable progress lines into j, and terminates j via
// j.finish: with "" on success, the error message on failure, or "aborted"
// when ctx was cancelled. Targets that produce an archive get SAT status
// post-processing before finishing.
// NOTE(review): ctx is conventionally the first parameter in Go; kept last
// here to preserve the existing call sites.
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
	if opts == nil {
		j.append("ERROR: handler options not configured")
		j.finish("handler options not configured")
		return
	}
	a := opts.App

	// Pre-existing log lines mean this job was picked up again after a
	// bee-web restart; record that for operators reading the log.
	recovered := len(j.lines) > 0
	j.append(fmt.Sprintf("Starting %s...", t.Name))
	if recovered {
		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
	}

	var (
		archive string
		err     error
	)

	// Dispatch on the task target. App-backed targets guard against a nil
	// *app.App first; "break" leaves the switch with err set.
	switch t.Target {
	case "nvidia":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Diagnostic depth: 2 normally, 3 when stress mode is requested.
		diagLevel := 2
		if t.params.StressMode {
			diagLevel = 3
		}
		// NOTE(review): diagLevel is always >= 2 at this point, so this
		// condition is always true and the RunNvidiaAcceptancePack fallback
		// below is unreachable — confirm whether it is still wanted.
		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
			result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
			if e != nil {
				err = e
			} else {
				archive = result.Body
			}
		} else {
			archive, err = a.RunNvidiaAcceptancePack("", j.append)
		}
	case "nvidia-targeted-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Default the stress duration to 5 minutes when unset.
		dur := t.params.Duration
		if dur <= 0 {
			dur = 300
		}
		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-bench-perf":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
			Profile:           t.params.BenchmarkProfile,
			SizeMB:            t.params.SizeMB,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			RunNCCL:           t.params.RunNCCL,
			ParallelGPUs:      t.params.ParallelGPUs,
			RampStep:          t.params.RampStep,
			RampTotal:         t.params.RampTotal,
			RampRunID:         t.params.RampRunID,
		}, j.append)
	case "nvidia-bench-power":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
			Profile:           t.params.BenchmarkProfile,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			RampStep:          t.params.RampStep,
			RampTotal:         t.params.RampTotal,
			RampRunID:         t.params.RampRunID,
		}, j.append)
	case "nvidia-bench-autotune":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
			Profile: t.params.BenchmarkProfile,
			SizeMB:  t.params.SizeMB,
		}, t.params.BenchmarkKind, j.append)
	case "nvidia-compute":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Burn-profile presets supply a duration when none is given.
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		// An optional staggered ramp-up plan may adjust per-GPU start times
		// and the effective duration.
		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
		if planErr != nil {
			err = planErr
			break
		}
		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
			dur = rampPlan.DurationSec
		}
		if rampPlan.StaggerSeconds > 0 {
			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
		}
		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
	case "nvidia-targeted-power":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-pulse":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-bandwidth":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
	case "nvidia-interconnect":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
	case "nvidia-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
		if planErr != nil {
			err = planErr
			break
		}
		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
			dur = rampPlan.DurationSec
		}
		if rampPlan.StaggerSeconds > 0 {
			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
		}
		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
			DurationSec:       dur,
			Loader:            t.params.Loader,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			StaggerSeconds:    rampPlan.StaggerSeconds,
		}, j.append)
	case "memory":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
	case "storage":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
	case "cpu":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		// Fallback defaults: 30 minutes in stress mode, 1 minute otherwise.
		if dur <= 0 {
			if t.params.StressMode {
				dur = 1800
			} else {
				dur = 60
			}
		}
		j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
	case "amd":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
	case "amd-mem":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
	case "amd-bandwidth":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
	case "amd-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
	case "memory-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
	case "sat-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
	case "platform-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
		runOpts.Components = t.params.PlatformComponents
		archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
	case "audit":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Audit output is streamed line-by-line into the job log rather
		// than producing an archive.
		result, e := a.RunAuditNow(opts.RuntimeMode)
		if e != nil {
			err = e
		} else {
			for _, line := range splitLines(result.Body) {
				j.append(line)
			}
		}
	case "support-bundle":
		// Needs no *app.App; it only packages files from the export dir.
		j.append("Building support bundle...")
		archive, err = buildSupportBundle(opts.ExportDir)
	case "install":
		if strings.TrimSpace(t.params.Device) == "" {
			err = fmt.Errorf("device is required")
			break
		}
		installLogPath := platform.InstallLogPath(t.params.Device)
		j.append("Install log: " + installLogPath)
		err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
	case "install-to-ram":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		err = a.RunInstallToRAM(ctx, j.append)
	default:
		j.append("ERROR: unknown target: " + t.Target)
		j.finish("unknown target")
		return
	}

	// Post-process any produced archive: surface a SAT failure verdict as
	// a task error and fold the result into the status DB when available.
	if archive != "" {
		archivePath := app.ExtractArchivePath(archive)
		if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
			err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
		}
		if opts.App != nil && opts.App.StatusDB != nil {
			app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
		}
	}

	if err != nil {
		// Distinguish cancellation (ctx done) from real failures.
		if ctx.Err() != nil {
			j.append("Aborted.")
			j.finish("aborted")
		} else {
			j.append("ERROR: " + err.Error())
			j.finish(err.Error())
		}
		return
	}
	if archive != "" {
		j.append("Archive: " + archive)
	}
	j.finish("")
}
|
||||||
|
|
||||||
|
func loadPersistedTask(statePath, taskID string) (*Task, error) {
|
||||||
|
data, err := os.ReadFile(statePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var persisted []persistedTask
|
||||||
|
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
for _, pt := range persisted {
|
||||||
|
if pt.ID != taskID {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
t := &Task{
|
||||||
|
ID: pt.ID,
|
||||||
|
Name: pt.Name,
|
||||||
|
Target: pt.Target,
|
||||||
|
Priority: pt.Priority,
|
||||||
|
Status: pt.Status,
|
||||||
|
CreatedAt: pt.CreatedAt,
|
||||||
|
StartedAt: pt.StartedAt,
|
||||||
|
DoneAt: pt.DoneAt,
|
||||||
|
ErrMsg: pt.ErrMsg,
|
||||||
|
LogPath: pt.LogPath,
|
||||||
|
ArtifactsDir: pt.ArtifactsDir,
|
||||||
|
ReportJSONPath: pt.ReportJSONPath,
|
||||||
|
ReportHTMLPath: pt.ReportHTMLPath,
|
||||||
|
params: pt.Params,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(t)
|
||||||
|
return t, nil
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("task %s not found", taskID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunPersistedTask executes a single task previously persisted by the web
// UI's queue (tasks-state.json under exportDir) inside this process, and
// maintains a runner-state.json heartbeat so the UI can track the detached
// runner. It returns a process exit code: 0 on success, 1 on failure, 2 on
// missing arguments.
// NOTE(review): stdout is currently unused; all diagnostics go to stderr —
// confirm whether that is intentional.
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
	if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
		fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
		return 2
	}

	// Runtime detection is best-effort: a failure is logged but does not
	// abort the run.
	runtimeInfo, err := runtimeenv.Detect("auto")
	if err != nil {
		slog.Warn("resolve runtime for task-run", "err", err)
	}
	opts := &HandlerOptions{
		ExportDir:   exportDir,
		App:         app.New(platform.New()),
		RuntimeMode: runtimeInfo.Mode,
	}
	statePath := filepath.Join(exportDir, "tasks-state.json")
	task, err := loadPersistedTask(statePath, taskID)
	if err != nil {
		fmt.Fprintln(stderr, err.Error())
		return 1
	}
	// Backfill start metadata the queue may not have set before handing off.
	if task.StartedAt == nil || task.StartedAt.IsZero() {
		now := time.Now()
		task.StartedAt = &now
	}
	if task.Status == "" {
		task.Status = TaskRunning
	}
	// Record this runner's PID up front so the UI can detect liveness or a
	// crashed runner. Failure to write the heartbeat is fatal.
	if err := writeTaskRunnerState(task, taskRunnerState{
		PID:       os.Getpid(),
		Status:    TaskRunning,
		UpdatedAt: time.Now().UTC(),
	}); err != nil {
		fmt.Fprintln(stderr, err.Error())
		return 1
	}

	// SIGINT/SIGTERM cancel ctx so the running pack can abort cleanly and
	// the task gets finalized as cancelled.
	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer cancel()

	j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
	executeTaskWithOptions(opts, task, j, ctx)
	finalizeTaskForResult(task, j.err, ctx.Err() != nil)
	// Report generation failure is non-fatal: warn in the log and continue.
	if err := writeTaskReportArtifacts(task); err != nil {
		appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
	}
	j.closeLog()
	// Final heartbeat carries the terminal status; a write failure here is
	// reported but does not change the exit code.
	if err := writeTaskRunnerState(task, taskRunnerState{
		PID:       os.Getpid(),
		Status:    task.Status,
		Error:     task.ErrMsg,
		UpdatedAt: time.Now().UTC(),
	}); err != nil {
		fmt.Fprintln(stderr, err.Error())
	}
	if task.ErrMsg != "" {
		return 1
	}
	return 0
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,7 @@ package webui
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -12,6 +13,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||||
@@ -161,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestNewJobIDUsesTASKPrefixAndZeroPadding verifies that newJobID produces
// sequential IDs of the form TASK-NNN, zero-padded and starting at 000.
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
	// Isolate the package-level queue and counter, restoring both afterwards
	// so other tests see their original state.
	globalQueue.mu.Lock()
	origTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	origCounter := jobCounter.Load()
	jobCounter.Store(0)
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = origTasks
		globalQueue.mu.Unlock()
		jobCounter.Store(origCounter)
	})

	// The argument passed here is "ignored" — presumably newJobID derives
	// IDs from the counter alone; confirm against its implementation.
	if got := newJobID("ignored"); got != "TASK-000" {
		t.Fatalf("id=%q want TASK-000", got)
	}
	if got := newJobID("ignored"); got != "TASK-001" {
		t.Fatalf("id=%q want TASK-001", got)
	}
}
|
||||||
|
|
||||||
|
// TestTaskArtifactsDirStartsWithTaskNumber verifies that the artifacts
// directory computed for a task starts with the numeric part of its ID
// (TASK-007 -> a directory whose base name begins with "007_").
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
	root := t.TempDir()
	task := &Task{
		ID:   "TASK-007",
		Name: "NVIDIA Benchmark",
	}
	got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
	if !strings.HasPrefix(got, "007_") {
		t.Fatalf("artifacts dir=%q want prefix 007_", got)
	}
}
|
||||||
|
|
||||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
logPath := filepath.Join(dir, "task.log")
|
logPath := filepath.Join(dir, "task.log")
|
||||||
@@ -248,15 +284,205 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
|||||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
db, err := openMetricsDB(metricsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
base := time.Now().UTC().Add(-45 * time.Second)
|
||||||
|
if err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base,
|
||||||
|
CPULoadPct: 42,
|
||||||
|
MemLoadPct: 35,
|
||||||
|
PowerW: 510,
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
_ = db.Close()
|
||||||
|
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
started := time.Now().UTC().Add(-90 * time.Second)
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: started.Add(-10 * time.Second),
|
||||||
|
StartedAt: &started,
|
||||||
|
}
|
||||||
|
q.assignTaskLogPathLocked(task)
|
||||||
|
appendJobLog(task.LogPath, "line-1")
|
||||||
|
|
||||||
|
job := newTaskJobState(task.LogPath)
|
||||||
|
job.finish("")
|
||||||
|
q.finalizeTaskRun(task, job)
|
||||||
|
|
||||||
|
if task.Status != TaskDone {
|
||||||
|
t.Fatalf("status=%q want %q", task.Status, TaskDone)
|
||||||
|
}
|
||||||
|
if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
|
||||||
|
t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(task.ReportJSONPath); err != nil {
|
||||||
|
t.Fatalf("report json: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(task.ReportHTMLPath); err != nil {
|
||||||
|
t.Fatalf("report html: %v", err)
|
||||||
|
}
|
||||||
|
var report taskReport
|
||||||
|
data, err := os.ReadFile(task.ReportJSONPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.json): %v", err)
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(data, &report); err != nil {
|
||||||
|
t.Fatalf("Unmarshal(report.json): %v", err)
|
||||||
|
}
|
||||||
|
if report.ID != task.ID || report.Status != TaskDone {
|
||||||
|
t.Fatalf("report=%+v", report)
|
||||||
|
}
|
||||||
|
if len(report.Charts) == 0 {
|
||||||
|
t.Fatalf("expected charts in report, got none")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
|
||||||
|
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||||
|
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-bench",
|
||||||
|
Name: "NVIDIA Bee Bench Perf",
|
||||||
|
Target: "nvidia-bench-perf",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||||
|
ArtifactsDir: artifactsDir,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(task)
|
||||||
|
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
|
||||||
|
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := writeTaskReportArtifacts(task); err != nil {
|
||||||
|
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.html): %v", err)
|
||||||
|
}
|
||||||
|
html := string(body)
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Perf Results`,
|
||||||
|
`Composite score for this benchmark task.`,
|
||||||
|
`GPU 0`,
|
||||||
|
`1176.25`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(html, needle) {
|
||||||
|
t.Fatalf("report missing %q: %s", needle, html)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||||
|
var lines []string
|
||||||
|
prev := taskSerialWriteLine
|
||||||
|
taskSerialWriteLine = func(line string) { lines = append(lines, line) }
|
||||||
|
t.Cleanup(func() { taskSerialWriteLine = prev })
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-serial-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
|
||||||
|
q.enqueue(task)
|
||||||
|
started := time.Now().UTC()
|
||||||
|
task.Status = TaskRunning
|
||||||
|
task.StartedAt = &started
|
||||||
|
job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||||
|
job.append("Starting CPU SAT...")
|
||||||
|
job.append("CPU stress duration: 60s")
|
||||||
|
job.finish("")
|
||||||
|
q.finalizeTaskRun(task, job)
|
||||||
|
|
||||||
|
joined := strings.Join(lines, "\n")
|
||||||
|
for _, needle := range []string{
|
||||||
|
"queued",
|
||||||
|
"Starting CPU SAT...",
|
||||||
|
"CPU stress duration: 60s",
|
||||||
|
"finished with status=done",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(joined, needle) {
|
||||||
|
t.Fatalf("serial mirror missing %q in %q", needle, joined)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveBurnPreset(t *testing.T) {
|
func TestResolveBurnPreset(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
profile string
|
profile string
|
||||||
want burnPreset
|
want burnPreset
|
||||||
}{
|
}{
|
||||||
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
|
||||||
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
|
{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
|
||||||
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
|
{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
|
||||||
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
{profile: "", want: burnPreset{DurationSec: 5 * 60}},
|
||||||
}
|
}
|
||||||
for _, tc := range tests {
|
for _, tc := range tests {
|
||||||
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||||
@@ -265,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveNvidiaRampPlan(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
profile string
|
||||||
|
enabled bool
|
||||||
|
selected []int
|
||||||
|
want nvidiaRampSpec
|
||||||
|
wantErr string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "disabled uses base preset",
|
||||||
|
profile: "acceptance",
|
||||||
|
selected: []int{0, 1},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "smoke ramp uses two minute steps",
|
||||||
|
profile: "smoke",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "acceptance ramp uses ten minute steps",
|
||||||
|
profile: "acceptance",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight stays at eight hours when possible",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight extends to keep one hour after final gpu",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight rejects impossible gpu count",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
|
||||||
|
wantErr: "at most 10 GPUs",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "enabled requires explicit selection",
|
||||||
|
profile: "smoke",
|
||||||
|
enabled: true,
|
||||||
|
wantErr: "requires explicit GPU selection",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
|
||||||
|
if tc.wantErr != "" {
|
||||||
|
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
|
||||||
|
t.Fatalf("err=%v want substring %q", err, tc.wantErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveNvidiaRampPlan error: %v", err)
|
||||||
|
}
|
||||||
|
if got != tc.want {
|
||||||
|
t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
loader string
|
loader string
|
||||||
@@ -369,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
|
||||||
|
var gotSizeMB, gotPasses int
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "mem-validate-1",
|
||||||
|
Name: "Memory SAT",
|
||||||
|
Target: "memory",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{StressMode: true},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runMemoryAcceptancePackCtx
|
||||||
|
runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
|
||||||
|
gotSizeMB = sizeMB
|
||||||
|
gotPasses = passes
|
||||||
|
return "/tmp/memory-validate.tar.gz", nil
|
||||||
|
}
|
||||||
|
defer func() { runMemoryAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotSizeMB != 512 || gotPasses != 1 {
|
||||||
|
t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
q := &taskQueue{
|
q := &taskQueue{
|
||||||
@@ -467,3 +800,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
|||||||
t.Fatalf("unexpected error: %q", j.err)
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
kmsgWatcher: newKmsgWatcher(nil),
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-panic-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
panic("boom")
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.executeTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if tk.Status != TaskFailed {
|
||||||
|
t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
|
||||||
|
}
|
||||||
|
if tk.DoneAt == nil {
|
||||||
|
t.Fatal("expected done_at to be set")
|
||||||
|
}
|
||||||
|
if !strings.Contains(tk.ErrMsg, "task panic: boom") {
|
||||||
|
t.Fatalf("task error=%q", tk.ErrMsg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(j.err, "task panic: boom") {
|
||||||
|
t.Fatalf("job error=%q", j.err)
|
||||||
|
}
|
||||||
|
q.kmsgWatcher.mu.Lock()
|
||||||
|
activeCount := q.kmsgWatcher.activeCount
|
||||||
|
window := q.kmsgWatcher.window
|
||||||
|
q.kmsgWatcher.mu.Unlock()
|
||||||
|
if activeCount != 0 {
|
||||||
|
t.Fatalf("activeCount=%d want 0", activeCount)
|
||||||
|
}
|
||||||
|
if window != nil {
|
||||||
|
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
2
bible
2
bible
Submodule bible updated: 688b87e98d...1d89a4918e
277
bible-local/docs/benchmark-clock-calibration.md
Normal file
277
bible-local/docs/benchmark-clock-calibration.md
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
# Benchmark clock calibration research
|
||||||
|
|
||||||
|
## Benchmark methodology versioning
|
||||||
|
|
||||||
|
Every benchmark methodology change must bump the benchmark version constant in
|
||||||
|
source code by exactly `+1`.
|
||||||
|
|
||||||
|
Methodology change means any change that affects comparability of benchmark
|
||||||
|
results, including for example:
|
||||||
|
- phase durations or phase order
|
||||||
|
- enabled/disabled precisions
|
||||||
|
- fallback rules
|
||||||
|
- normalization rules
|
||||||
|
- score formulas or weights
|
||||||
|
- degradation thresholds
|
||||||
|
- power calibration logic
|
||||||
|
- thermal/power penalty logic
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- benchmark version must be stored in source code as an explicit version
|
||||||
|
constant, not inferred from git tag or build metadata
|
||||||
|
- benchmark report must always print the benchmark version
|
||||||
|
- `result.json` must always include the benchmark version
|
||||||
|
- results from different benchmark versions must be treated as non-comparable by
|
||||||
|
default
|
||||||
|
|
||||||
|
Purpose:
|
||||||
|
- prevent accidental comparison of runs produced by different methodologies
|
||||||
|
- make historical benchmark archives self-describing even when detached from git
|
||||||
|
- force deliberate version bumps whenever scoring or execution semantics change
|
||||||
|
|
||||||
|
## Status
|
||||||
|
In progress. Baseline data from production servers pending.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
|
||||||
|
before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
|
||||||
|
`avg_steady_clock < locked_target * 0.90`.
|
||||||
|
|
||||||
|
Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
|
||||||
|
even a healthy GPU in a non-ideal server will sustain clocks well below boost.
|
||||||
|
The 90% threshold has no empirical basis.
|
||||||
|
|
||||||
|
## Key observations (2026-04-06)
|
||||||
|
|
||||||
|
### H100 PCIe — new card, server not designed for it
|
||||||
|
- avg clock 1384 MHz, P95 1560 MHz (unstable, proba boost 1755 MHz)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
|
||||||
|
- Stability: 70.0 — clocks erratic, no equilibrium found
|
||||||
|
- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
|
||||||
|
|
||||||
|
### H200 NVL — new card, server not designed for it
|
||||||
|
- avg clock = P95 = 1635 MHz (perfectly stable)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
|
||||||
|
- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
|
||||||
|
- Degradation: power_capped, thermal_limited
|
||||||
|
- Compute: 989 TOPS — card is computing correctly for its frequency
|
||||||
|
|
||||||
|
### Key insight
|
||||||
|
The meaningful distinction is not *whether* the card throttles but *how stably*
|
||||||
|
it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
|
||||||
|
H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
|
||||||
|
instability may reflect a more severe thermal mismatch or a card issue.
|
||||||
|
|
||||||
|
`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
|
||||||
|
`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
|
||||||
|
|
||||||
|
## Hypothesis for baseline
|
||||||
|
|
||||||
|
After testing on servers designed for their GPUs (proper cooling):
|
||||||
|
- Healthy GPU under sustained load will run at a stable fraction of boost
|
||||||
|
- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
|
||||||
|
- Base clock (`clocks.base.gr`) may be a better reference than boost:
|
||||||
|
a healthy card under real workload should comfortably exceed base clock
|
||||||
|
|
||||||
|
## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
|
||||||
|
|
||||||
|
Source: external stress test tool, ~90s runs, designed server, adequate power.
|
||||||
|
|
||||||
|
### Healthy fingerprint
|
||||||
|
|
||||||
|
- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
|
||||||
|
- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
|
||||||
|
- Avg steady (visual): **~1580–1620 MHz**
|
||||||
|
- vs boost 1755 MHz: **~91–92%**
|
||||||
|
- Oscillation is NORMAL — this is the boost algorithm balancing under power cap
|
||||||
|
- Stable power + oscillating clocks = healthy power-cap behavior
|
||||||
|
- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
|
||||||
|
- **Consistency**: all 10 samples within ±20 MHz — very repeatable
|
||||||
|
|
||||||
|
### Characteristic patten
|
||||||
|
Flat power line + oscillating/declining clock line = GPU correctly managed by
|
||||||
|
power cap algorithm. Do NOT flag this as instability.
|
||||||
|
|
||||||
|
### Clock CV implication
|
||||||
|
The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
|
||||||
|
The current `variance_too_high` threshold (StabilityScore < 85) may fire on
|
||||||
|
healthy HBM2e PCIe cards. Needs recalibration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
|
||||||
|
|
||||||
|
Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
|
||||||
|
Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
|
||||||
|
|
||||||
|
### GPU clock reference (from nvidia-smi, idle):
|
||||||
|
- base_clock_mhz: **1095**
|
||||||
|
- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
|
||||||
|
- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
|
||||||
|
- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
|
||||||
|
|
||||||
|
### Observed under 700W sustained load (both samples nearly identical):
|
||||||
|
- Power: ~700W flat — SXM slot, adequate power confirmed
|
||||||
|
- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
|
||||||
|
- vs 1980 MHz (lock target): **72–74%** — severely below
|
||||||
|
- vs 1755 MHz (nvidia-smi boost): **81–83%**
|
||||||
|
- vs 1095 MHz (base): 130% — above base but far below expected for SXM
|
||||||
|
- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
|
||||||
|
- Temperature: 38°C → 79–80°C (same rate as HBM2e)
|
||||||
|
- Oscillation: present, similar character to HBM2e but at much lower frequency
|
||||||
|
|
||||||
|
### Diagnosis
|
||||||
|
These restored cards are degraded. A healthy H100 SXM in a designed server
|
||||||
|
(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
|
||||||
|
The 72–74% result is a clear signal of silicon or VRM degradation from the
|
||||||
|
refurbishment process.
|
||||||
|
|
||||||
|
### Clock pattern note
|
||||||
|
Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
|
||||||
|
to images 19/20. Both sample sets show same degraded pattern — same batch.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline matrix (filled where data available)
|
||||||
|
|
||||||
|
| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
|
||||||
|
| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
|
||||||
|
| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
|
||||||
|
| H200 NVL | designed | TBD | TBD | TBD | need baseline |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## H100 official spec (from NVIDIA datasheet)
|
||||||
|
|
||||||
|
Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
|
||||||
|
All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
|
||||||
|
| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
|
||||||
|
| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
|
||||||
|
| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- SXM boards do NOT list FP8 peak in this table (field empty)
|
||||||
|
- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
|
||||||
|
- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
|
||||||
|
|
||||||
|
## Observed efficiency (H100 80GB PCIe, throttled server)
|
||||||
|
|
||||||
|
From the report in this session (power+thermal throttle throughout steady):
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
|
||||||
|
| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
|
||||||
|
| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
|
||||||
|
|
||||||
|
33–44% of spec is expected given sustained power+thermal throttle (avg clock
|
||||||
|
1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
|
||||||
|
actual frequency — the low TOPS comes from throttle, not silicon defect.
|
||||||
|
|
||||||
|
## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
|
||||||
|
|
||||||
|
Format: without sparsity / with sparsity.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
|
||||||
|
| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
|
||||||
|
|
||||||
|
## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
|
||||||
|
|
||||||
|
Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
|
||||||
|
| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
|
||||||
|
| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
|
||||||
|
|
||||||
|
Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
|
||||||
|
both are throttle-limited. Confirms that % of spec is not a quality signal,
|
||||||
|
it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
|
||||||
|
|
||||||
|
## Real-world GEMM efficiency reference (2026-04-06, web research)
|
||||||
|
|
||||||
|
Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
|
||||||
|
worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
|
||||||
|
|
||||||
|
### What healthy systems actually achieve:
|
||||||
|
- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
|
||||||
|
- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
|
||||||
|
- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
|
||||||
|
|
||||||
|
### Our results vs expectation:
|
||||||
|
| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
|
||||||
|
| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
|
||||||
|
|
||||||
|
Our results are roughly **half** of what a healthy system achieves even under throttle.
|
||||||
|
This is NOT normal — 30-44% is not the industry baseline.
|
||||||
|
|
||||||
|
### Likely causes of the gap (in order of probability):
|
||||||
|
1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
|
||||||
|
2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
|
||||||
|
Previous user may have set a lower limit via nvidia-smi -pl and it was not
|
||||||
|
reset. Our normalization sets clock locks but does NOT reset power limit.
|
||||||
|
Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
|
||||||
|
3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
|
||||||
|
8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
|
||||||
|
|
||||||
|
### Power limit gap analysis (H100 PCIe):
|
||||||
|
- Avg clock 1384 MHz = 79% of boost 1755 MHz
|
||||||
|
- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
|
||||||
|
- Actually measured: 329 TOPS = 55% of that estimate
|
||||||
|
- Remaining gap after accounting for clock throttle: ~45%
|
||||||
|
- Most likely explanation: enforced power limit < 350W TDP, further reducing
|
||||||
|
sustainable clock beyond what sw_thermal alone would cause.
|
||||||
|
|
||||||
|
### Action item:
|
||||||
|
Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
|
||||||
|
so result.json shows if the card was pre-configured with a non-default limit.
|
||||||
|
If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
|
||||||
|
|
||||||
|
### CPU/RAM impact on GPU FLOPS:
|
||||||
|
None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
|
||||||
|
CPU core count and host RAM are irrelevant.
|
||||||
|
|
||||||
|
## Compute efficiency metric (proposed, no hardcode)
|
||||||
|
|
||||||
|
Instead of comparing TOPS to a hardcoded spec, compute:
|
||||||
|
tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
|
||||||
|
|
||||||
|
This is model-agnostic. A GPU computing correctly at its actual frequency
|
||||||
|
will show a consistent tops_per_sm_per_ghz regardless of throttle level.
|
||||||
|
A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
|
||||||
|
normal clocks.
|
||||||
|
|
||||||
|
SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
|
||||||
|
(needs to be added to queryBenchmarkGPUInfo).
|
||||||
|
|
||||||
|
Reference values to establish after baseline runs:
|
||||||
|
- H100 PCIe fp16_tensor: TBD tops/SM/GHz
|
||||||
|
- H100 SXM fp16_tensor: TBD tops/SM/GHz
|
||||||
|
|
||||||
|
## Proposed threshold changes (pending more data)
|
||||||
|
|
||||||
|
1. **`low_sm_clock_vs_target`**: raise threshold from 90% to 85% based on observed
|
||||||
|
91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
|
||||||
|
capture the root cause.
|
||||||
|
|
||||||
|
2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
|
||||||
|
under power cap. Consider suppressing this flag when power is flat and usage
|
||||||
|
is 100% (oscillation is expected). Or lower threshold to 70.
|
||||||
|
|
||||||
|
3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
|
||||||
|
ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
|
||||||
|
would have been caught by this).
|
||||||
|
|
||||||
|
Decision deferred until baseline on SXM designed servers collected.
|
||||||
121
bible-local/docs/gpu-model-propagation.md
Normal file
121
bible-local/docs/gpu-model-propagation.md
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
# GPU Model Name Propagation
|
||||||
|
|
||||||
|
How GPU model names are detected, stored, and displayed throughout the project.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Detection Sources
|
||||||
|
|
||||||
|
There are **two separate pipelines** for GPU model names — they use different structs and don't share state.
|
||||||
|
|
||||||
|
### Pipeline A — Live / SAT (nvidia-smi query at runtime)
|
||||||
|
|
||||||
|
**File:** `audit/internal/platform/sat.go`
|
||||||
|
|
||||||
|
- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
|
||||||
|
- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
|
||||||
|
- Used by: GPU selection UI, live metrics labels, burn/stress test logic
|
||||||
|
|
||||||
|
### Pipeline B — Benchmark results
|
||||||
|
|
||||||
|
**File:** `audit/internal/platform/benchmark.go`, line 124
|
||||||
|
|
||||||
|
- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
|
||||||
|
- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
|
||||||
|
- Used by: benchmark history table, benchmark report
|
||||||
|
|
||||||
|
### Pipeline C — Hardware audit JSON (PCIe schema)
|
||||||
|
|
||||||
|
**File:** `audit/internal/schema/hardware.go`
|
||||||
|
|
||||||
|
- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
|
||||||
|
- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
|
||||||
|
- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
|
||||||
|
- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key Inconsistency: NVIDIA PCIe Model is Never Set
|
||||||
|
|
||||||
|
`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
|
||||||
|
|
||||||
|
This means:
|
||||||
|
- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
|
||||||
|
- AMD GPUs do have their model populated
|
||||||
|
|
||||||
|
The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Benchmark History "Unknown GPU" Issue
|
||||||
|
|
||||||
|
**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
|
||||||
|
|
||||||
|
**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
|
||||||
|
|
||||||
|
This happens for:
|
||||||
|
- Older result files saved before the `Name` field was added
|
||||||
|
- Runs where nvidia-smi query failed before the benchmark started
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fallback Strings — Current State
|
||||||
|
|
||||||
|
| Location | File | Fallback string |
|
||||||
|
|---|---|---|
|
||||||
|
| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
|
||||||
|
| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
|
||||||
|
| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
|
||||||
|
| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
|
||||||
|
| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
|
||||||
|
| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
|
||||||
|
| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
|
||||||
|
| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
|
||||||
|
|
||||||
|
**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## GPU Selection UI
|
||||||
|
|
||||||
|
**File:** `audit/internal/webui/pages.go`
|
||||||
|
|
||||||
|
- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
|
||||||
|
- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
|
||||||
|
- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
|
||||||
|
|
||||||
|
This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Flow Summary
|
||||||
|
|
||||||
|
```
|
||||||
|
nvidia-smi (live)
|
||||||
|
└─ ListNvidiaGPUs() → NvidiaGPU.Name
|
||||||
|
├─ GPU selection UI (always correct)
|
||||||
|
├─ Live metrics labels (charts_svg.go)
|
||||||
|
└─ SAT/burn status file (sat.go)
|
||||||
|
|
||||||
|
nvidia-smi (at benchmark start)
|
||||||
|
└─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
|
||||||
|
└─ BenchmarkGPUResult.Name (json:"name,omitempty")
|
||||||
|
├─ Benchmark report
|
||||||
|
└─ Benchmark history table columns
|
||||||
|
|
||||||
|
nvidia-smi / lspci (audit collection)
|
||||||
|
└─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
|
||||||
|
└─ Hardware summary page hwDescribeGPU()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fixed Issues
|
||||||
|
|
||||||
|
All previously open items are resolved:
|
||||||
|
|
||||||
|
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
|
||||||
|
2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
|
||||||
|
3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
|
||||||
|
4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
|
||||||
|
5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
|
||||||
@@ -15,6 +15,41 @@ This applies to:
|
|||||||
- `iso/builder/config/package-lists/*.list.chroot`
|
- `iso/builder/config/package-lists/*.list.chroot`
|
||||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||||
|
|
||||||
|
## Bootloader sync rule
|
||||||
|
|
||||||
|
The ISO has two independent bootloader configs that must be kept in sync manually:
|
||||||
|
|
||||||
|
| File | Used by |
|
||||||
|
|------|---------|
|
||||||
|
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
|
||||||
|
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
|
||||||
|
|
||||||
|
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
|
||||||
|
change, or new mode added to one file must be manually mirrored in the other.
|
||||||
|
|
||||||
|
**Canonical entry list** (both files must have all of these):
|
||||||
|
|
||||||
|
| Label | Key params |
|
||||||
|
|-------|-----------|
|
||||||
|
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
|
||||||
|
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
|
||||||
|
|
||||||
|
**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
|
||||||
|
```
|
||||||
|
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
|
||||||
|
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
nowatchdog nosoftlockup
|
||||||
|
```
|
||||||
|
(fail-safe is the exception — it deliberately uses minimal params.)
|
||||||
|
|
||||||
|
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
|
||||||
|
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
|
||||||
|
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
|
||||||
|
|
||||||
## Memtest rule
|
## Memtest rule
|
||||||
|
|
||||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||||
|
|||||||
@@ -1,12 +1,13 @@
|
|||||||
DEBIAN_VERSION=12
|
DEBIAN_VERSION=12
|
||||||
DEBIAN_KERNEL_ABI=auto
|
DEBIAN_KERNEL_ABI=auto
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
NVIDIA_DRIVER_VERSION=590.48.01
|
||||||
|
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
NCCL_TESTS_VERSION=2.13.10
|
NCCL_TESTS_VERSION=2.13.10
|
||||||
NVCC_VERSION=12.8
|
NVCC_VERSION=12.8
|
||||||
CUBLAS_VERSION=13.0.2.14-1
|
CUBLAS_VERSION=13.1.1.3-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
DCGM_VERSION=4.5.3-1
|
DCGM_VERSION=4.5.3-1
|
||||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
|||||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
MEMTEST_VERSION=6.10-4
|
||||||
|
|||||||
@@ -23,16 +23,17 @@ lb config noauto \
|
|||||||
--bootloaders "grub-efi,syslinux" \
|
--bootloaders "grub-efi,syslinux" \
|
||||||
--debian-installer none \
|
--debian-installer none \
|
||||||
--archive-areas "main contrib non-free non-free-firmware" \
|
--archive-areas "main contrib non-free non-free-firmware" \
|
||||||
--mirror-bootstrap "https://deb.debian.org/debian" \
|
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
|
||||||
--mirror-chroot "https://deb.debian.org/debian" \
|
--mirror-chroot "http://mirror.mephi.ru/debian/" \
|
||||||
--mirror-binary "https://deb.debian.org/debian" \
|
--mirror-binary "http://mirror.mephi.ru/debian/" \
|
||||||
--security true \
|
--security true \
|
||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
|
--debootstrap-options "--include=ca-certificates" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -33,10 +33,10 @@ typedef void *CUstream;
|
|||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||||
#define MAX_STRESS_STREAMS 16
|
#define MAX_STRESS_STREAMS 16
|
||||||
#define MAX_CUBLAS_PROFILES 5
|
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
#define STRESS_LAUNCH_DEPTH 8
|
#define MAX_SINGLE_PRECISION_STREAMS 4
|
||||||
|
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -298,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
|
|||||||
return stream_count;
|
return stream_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
|
||||||
|
if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
|
||||||
|
return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
return profile_budget_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||||
if (!api->cuStreamDestroy) {
|
if (!api->cuStreamDestroy) {
|
||||||
return;
|
return;
|
||||||
@@ -344,7 +351,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
unsigned long iterations = 0;
|
unsigned long iterations = 0;
|
||||||
int mp_count = 0;
|
int mp_count = 0;
|
||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int launches_per_wave = 0;
|
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||||
@@ -419,44 +425,42 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
|
|
||||||
unsigned int threads = 256;
|
unsigned int threads = 256;
|
||||||
|
|
||||||
double start = now_seconds();
|
double deadline = now_seconds() + (double)seconds;
|
||||||
double deadline = start + (double)seconds;
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
launches_per_wave = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
int launched_this_batch = 0;
|
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||||
for (int lane = 0; lane < stream_count; lane++) {
|
if (!check_rc(api,
|
||||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
"cuLaunchKernel",
|
||||||
if (!check_rc(api,
|
api->cuLaunchKernel(kernel,
|
||||||
"cuLaunchKernel",
|
blocks,
|
||||||
api->cuLaunchKernel(kernel,
|
1,
|
||||||
blocks,
|
1,
|
||||||
1,
|
threads,
|
||||||
1,
|
1,
|
||||||
threads,
|
1,
|
||||||
1,
|
0,
|
||||||
1,
|
streams[lane],
|
||||||
0,
|
params[lane],
|
||||||
streams[lane],
|
NULL))) {
|
||||||
params[lane],
|
goto fail;
|
||||||
NULL))) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
launches_per_wave++;
|
|
||||||
launched_this_batch++;
|
|
||||||
}
|
|
||||||
if (launched_this_batch <= 0) {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
launched++;
|
||||||
|
iterations++;
|
||||||
}
|
}
|
||||||
if (launches_per_wave <= 0) {
|
if (launched <= 0) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
double now = now_seconds();
|
||||||
goto fail;
|
if (now >= next_sync || now >= deadline) {
|
||||||
|
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
next_sync = now + 1.0;
|
||||||
}
|
}
|
||||||
iterations += (unsigned long)launches_per_wave;
|
|
||||||
}
|
}
|
||||||
|
api->cuCtxSynchronize();
|
||||||
|
|
||||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||||
goto fail;
|
goto fail;
|
||||||
@@ -468,11 +472,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
report->iterations = iterations;
|
report->iterations = iterations;
|
||||||
snprintf(report->details,
|
snprintf(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
bytes_per_stream[0] / (1024u * 1024u),
|
bytes_per_stream[0] / (1024u * 1024u),
|
||||||
iterations);
|
iterations);
|
||||||
|
|
||||||
@@ -606,6 +609,20 @@ struct prepared_profile {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const struct profile_desc k_profiles[] = {
|
static const struct profile_desc k_profiles[] = {
|
||||||
|
{
|
||||||
|
"fp64",
|
||||||
|
"fp64",
|
||||||
|
80,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
8,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUBLAS_COMPUTE_64F,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp32_tf32",
|
"fp32_tf32",
|
||||||
"fp32",
|
"fp32",
|
||||||
@@ -634,6 +651,20 @@ static const struct profile_desc k_profiles[] = {
|
|||||||
CUDA_R_16F,
|
CUDA_R_16F,
|
||||||
CUBLAS_COMPUTE_32F_FAST_16F,
|
CUBLAS_COMPUTE_32F_FAST_16F,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"int8_tensor",
|
||||||
|
"int8",
|
||||||
|
75,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
128,
|
||||||
|
CUDA_R_8I,
|
||||||
|
CUDA_R_8I,
|
||||||
|
CUDA_R_32I,
|
||||||
|
CUDA_R_32I,
|
||||||
|
CUBLAS_COMPUTE_32I,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp8_e4m3",
|
"fp8_e4m3",
|
||||||
"fp8",
|
"fp8",
|
||||||
@@ -680,6 +711,21 @@ static const struct profile_desc k_profiles[] = {
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
||||||
|
|
||||||
|
static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
|
||||||
|
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
return strcmp(desc->block_label, precision_filter) == 0;
|
||||||
|
}
|
||||||
|
/* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
|
||||||
|
* unstable on the current benchmark fleet and can abort the whole mixed
|
||||||
|
* pass after earlier phases already collected useful telemetry. */
|
||||||
|
return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int load_cublaslt(struct cublaslt_api *api) {
|
static int load_cublaslt(struct cublaslt_api *api) {
|
||||||
memset(api, 0, sizeof(*api));
|
memset(api, 0, sizeof(*api));
|
||||||
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
||||||
@@ -750,10 +796,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
|
|||||||
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case CUDA_R_32F:
|
case CUDA_R_32F:
|
||||||
|
case CUDA_R_32I:
|
||||||
return (size_t)(elements * 4u);
|
return (size_t)(elements * 4u);
|
||||||
case CUDA_R_16F:
|
case CUDA_R_16F:
|
||||||
case CUDA_R_16BF:
|
case CUDA_R_16BF:
|
||||||
return (size_t)(elements * 2u);
|
return (size_t)(elements * 2u);
|
||||||
|
case CUDA_R_8I:
|
||||||
case CUDA_R_8F_E4M3:
|
case CUDA_R_8F_E4M3:
|
||||||
case CUDA_R_8F_E5M2:
|
case CUDA_R_8F_E5M2:
|
||||||
return (size_t)(elements);
|
return (size_t)(elements);
|
||||||
@@ -766,6 +814,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
|
||||||
|
if (desc->compute_type == CUBLAS_COMPUTE_32I) {
|
||||||
|
return CUDA_R_32I;
|
||||||
|
}
|
||||||
|
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
|
||||||
|
return CUDA_R_64F;
|
||||||
|
}
|
||||||
|
return CUDA_R_32F;
|
||||||
|
}
|
||||||
|
|
||||||
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
||||||
uint64_t row_tiles = (rows + 127u) / 128u;
|
uint64_t row_tiles = (rows + 127u) / 128u;
|
||||||
uint64_t col_tiles = (cols + 63u) / 64u;
|
uint64_t col_tiles = (cols + 63u) / 64u;
|
||||||
@@ -872,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
CUstream stream,
|
CUstream stream,
|
||||||
size_t profile_budget_bytes,
|
size_t profile_budget_bytes,
|
||||||
struct prepared_profile *out) {
|
struct prepared_profile *out) {
|
||||||
memset(out, 0, sizeof(*out));
|
|
||||||
out->desc = *desc;
|
|
||||||
out->stream = stream;
|
|
||||||
|
|
||||||
size_t bytes_per_cell = 0;
|
size_t bytes_per_cell = 0;
|
||||||
|
size_t attempt_budget = profile_budget_bytes;
|
||||||
|
|
||||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
||||||
@@ -885,105 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
|
||||||
out->m = dim;
|
memset(out, 0, sizeof(*out));
|
||||||
out->n = dim;
|
out->desc = *desc;
|
||||||
out->k = dim;
|
out->stream = stream;
|
||||||
|
|
||||||
size_t desired_workspace = profile_budget_bytes / 8u;
|
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
|
||||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
out->m = dim;
|
||||||
desired_workspace = 32u * 1024u * 1024u;
|
out->n = dim;
|
||||||
}
|
out->k = dim;
|
||||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
|
||||||
|
|
||||||
size_t a_bytes = 0;
|
size_t desired_workspace = attempt_budget / 8u;
|
||||||
size_t b_bytes = 0;
|
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||||
size_t c_bytes = 0;
|
desired_workspace = 32u * 1024u * 1024u;
|
||||||
size_t d_bytes = 0;
|
}
|
||||||
size_t scale_bytes = 0;
|
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||||
while (1) {
|
|
||||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
|
||||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
|
||||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
|
||||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
|
||||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
|
||||||
|
|
||||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
size_t a_bytes = 0;
|
||||||
if (matrix_bytes <= profile_budget_bytes) {
|
size_t b_bytes = 0;
|
||||||
size_t remaining = profile_budget_bytes - matrix_bytes;
|
size_t c_bytes = 0;
|
||||||
out->workspace_size = desired_workspace;
|
size_t d_bytes = 0;
|
||||||
if (out->workspace_size > remaining) {
|
size_t scale_bytes = 0;
|
||||||
out->workspace_size = round_down_size(remaining, 256u);
|
while (1) {
|
||||||
|
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||||
|
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||||
|
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||||
|
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||||
|
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||||
|
|
||||||
|
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||||
|
if (matrix_bytes <= attempt_budget) {
|
||||||
|
size_t remaining = attempt_budget - matrix_bytes;
|
||||||
|
out->workspace_size = desired_workspace;
|
||||||
|
if (out->workspace_size > remaining) {
|
||||||
|
out->workspace_size = round_down_size(remaining, 256u);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
|
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
out->m -= (uint64_t)desc->min_multiple;
|
||||||
|
out->n = out->m;
|
||||||
|
out->k = out->m;
|
||||||
|
}
|
||||||
|
if (out->m < (uint64_t)desc->min_multiple) {
|
||||||
|
attempt_budget /= 2u;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||||
return 0;
|
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||||
}
|
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||||
out->m -= (uint64_t)desc->min_multiple;
|
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||||
out->n = out->m;
|
|
||||||
out->k = out->m;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
|
||||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
cublasOperation_t transa = CUBLAS_OP_T;
|
|
||||||
cublasOperation_t transb = CUBLAS_OP_N;
|
|
||||||
if (!check_cublas("set TRANSA",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
|
||||||
&transa,
|
|
||||||
sizeof(transa))) ||
|
|
||||||
!check_cublas("set TRANSB",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
|
||||||
&transb,
|
|
||||||
sizeof(transb)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (desc->needs_scalar_scale) {
|
|
||||||
float one = 1.0f;
|
|
||||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
|
||||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||||
|
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||||
|
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
|
||||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
cublasOperation_t transa = CUBLAS_OP_T;
|
||||||
if (!check_cublas("set A scale ptr",
|
cublasOperation_t transb = CUBLAS_OP_N;
|
||||||
|
if (!check_cublas("set TRANSA",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||||
&a_scale_ptr,
|
&transa,
|
||||||
sizeof(a_scale_ptr))) ||
|
sizeof(transa))) ||
|
||||||
!check_cublas("set B scale ptr",
|
!check_cublas("set TRANSB",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||||
&b_scale_ptr,
|
&transb,
|
||||||
sizeof(b_scale_ptr)))) {
|
sizeof(transb)))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
if (desc->needs_scalar_scale) {
|
||||||
|
float one = 1.0f;
|
||||||
|
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||||
|
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||||
|
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||||
|
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||||
|
if (!check_cublas("set A scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||||
|
&a_scale_ptr,
|
||||||
|
sizeof(a_scale_ptr))) ||
|
||||||
|
!check_cublas("set B scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||||
|
&b_scale_ptr,
|
||||||
|
sizeof(b_scale_ptr)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
||||||
if (desc->needs_block_scale) {
|
if (desc->needs_block_scale) {
|
||||||
@@ -1023,78 +1089,94 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!check_cublas("create A layout",
|
if (!check_cublas("create A layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||||
!check_cublas("create B layout",
|
!check_cublas("create B layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||||
!check_cublas("create C layout",
|
!check_cublas("create C layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||||
!check_cublas("create D layout",
|
!check_cublas("create D layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (out->workspace_size > 0) {
|
|
||||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (out->workspace_size > 0) {
|
||||||
|
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("set workspace",
|
||||||
|
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||||
|
out->preference,
|
||||||
|
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||||
|
&out->workspace_size,
|
||||||
|
sizeof(out->workspace_size)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int found = 0;
|
||||||
|
if (check_cublas("heuristic",
|
||||||
|
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||||
|
out->op_desc,
|
||||||
|
out->a_layout,
|
||||||
|
out->b_layout,
|
||||||
|
out->c_layout,
|
||||||
|
out->d_layout,
|
||||||
|
out->preference,
|
||||||
|
1,
|
||||||
|
&out->heuristic,
|
||||||
|
&found)) &&
|
||||||
|
found > 0) {
|
||||||
|
out->ready = 1;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
|
||||||
|
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!check_cublas("set workspace",
|
return 0;
|
||||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
|
||||||
out->preference,
|
|
||||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
|
||||||
&out->workspace_size,
|
|
||||||
sizeof(out->workspace_size)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int found = 0;
|
|
||||||
if (!check_cublas("heuristic",
|
|
||||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
|
||||||
out->op_desc,
|
|
||||||
out->a_layout,
|
|
||||||
out->b_layout,
|
|
||||||
out->c_layout,
|
|
||||||
out->d_layout,
|
|
||||||
out->preference,
|
|
||||||
1,
|
|
||||||
&out->heuristic,
|
|
||||||
&found))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (found <= 0) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
out->ready = 1;
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||||
struct cublaslt_api *cublas,
|
struct cublaslt_api *cublas,
|
||||||
struct prepared_profile *profile) {
|
struct prepared_profile *profile) {
|
||||||
|
int32_t alpha_i32 = 1;
|
||||||
|
int32_t beta_i32 = 0;
|
||||||
|
double alpha_f64 = 1.0;
|
||||||
|
double beta_f64 = 0.0;
|
||||||
float alpha = 1.0f;
|
float alpha = 1.0f;
|
||||||
float beta = 0.0f;
|
float beta = 0.0f;
|
||||||
|
const void *alpha_ptr = α
|
||||||
|
const void *beta_ptr = β
|
||||||
|
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
|
||||||
|
alpha_ptr = &alpha_i32;
|
||||||
|
beta_ptr = &beta_i32;
|
||||||
|
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
|
||||||
|
alpha_ptr = &alpha_f64;
|
||||||
|
beta_ptr = &beta_f64;
|
||||||
|
}
|
||||||
return check_cublas(profile->desc.name,
|
return check_cublas(profile->desc.name,
|
||||||
cublas->cublasLtMatmul(handle,
|
cublas->cublasLtMatmul(handle,
|
||||||
profile->op_desc,
|
profile->op_desc,
|
||||||
&alpha,
|
alpha_ptr,
|
||||||
(const void *)(uintptr_t)profile->a_dev,
|
(const void *)(uintptr_t)profile->a_dev,
|
||||||
profile->a_layout,
|
profile->a_layout,
|
||||||
(const void *)(uintptr_t)profile->b_dev,
|
(const void *)(uintptr_t)profile->b_dev,
|
||||||
profile->b_layout,
|
profile->b_layout,
|
||||||
&beta,
|
beta_ptr,
|
||||||
(const void *)(uintptr_t)profile->c_dev,
|
(const void *)(uintptr_t)profile->c_dev,
|
||||||
profile->c_layout,
|
profile->c_layout,
|
||||||
(void *)(uintptr_t)profile->d_dev,
|
(void *)(uintptr_t)profile->d_dev,
|
||||||
@@ -1112,9 +1194,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int cc_minor,
|
int cc_minor,
|
||||||
int seconds,
|
int seconds,
|
||||||
int size_mb,
|
int size_mb,
|
||||||
|
const char *precision_filter,
|
||||||
struct stress_report *report) {
|
struct stress_report *report) {
|
||||||
struct cublaslt_api cublas;
|
struct cublaslt_api cublas;
|
||||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
|
||||||
cublasLtHandle_t handle = NULL;
|
cublasLtHandle_t handle = NULL;
|
||||||
CUcontext ctx = NULL;
|
CUcontext ctx = NULL;
|
||||||
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||||
@@ -1124,12 +1207,12 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int active = 0;
|
int active = 0;
|
||||||
int mp_count = 0;
|
int mp_count = 0;
|
||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
int profile_count = PROFILE_COUNT;
|
||||||
int prepared_count = 0;
|
int prepared_count = 0;
|
||||||
int wave_launches = 0;
|
|
||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
size_t per_profile_budget = 0;
|
size_t per_profile_budget = 0;
|
||||||
|
int budget_profiles = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||||
@@ -1150,8 +1233,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Count profiles matching the filter (for deciding what to run). */
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||||
planned++;
|
planned++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1162,18 +1246,42 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Count all profiles active on this GPU regardless of filter.
|
||||||
|
* Mixed phases still divide budget across the full precision set, while
|
||||||
|
* single-precision benchmark phases dedicate budget only to active
|
||||||
|
* profiles matching precision_filter. */
|
||||||
|
int planned_total = 0;
|
||||||
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
|
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||||
|
planned_total++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (planned_total < planned) {
|
||||||
|
planned_total = planned;
|
||||||
|
}
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
budget_profiles = planned;
|
||||||
|
}
|
||||||
|
if (budget_profiles <= 0) {
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
}
|
||||||
|
|
||||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||||
cuda->cuStreamCreate &&
|
cuda->cuStreamCreate &&
|
||||||
cuda->cuStreamDestroy) {
|
cuda->cuStreamDestroy) {
|
||||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
|
||||||
|
}
|
||||||
|
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
|
||||||
|
stream_count = MAX_SINGLE_PRECISION_STREAMS;
|
||||||
}
|
}
|
||||||
if (stream_count > 1) {
|
if (stream_count > 1) {
|
||||||
int created = 0;
|
int created = 0;
|
||||||
@@ -1186,19 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
report->stream_count = stream_count;
|
report->stream_count = stream_count;
|
||||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
|
||||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
|
||||||
|
}
|
||||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
mp_count,
|
mp_count,
|
||||||
|
budget_profiles,
|
||||||
per_profile_budget / (1024u * 1024u));
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
for (int i = 0; i < profile_count; i++) {
|
for (int i = 0; i < profile_count; i++) {
|
||||||
@@ -1211,6 +1322,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
desc->min_cc);
|
desc->min_cc);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (!profile_allowed_for_run(desc, cc, precision_filter)) {
|
||||||
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"%s=SKIPPED benchmark_disabled\n",
|
||||||
|
desc->name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (int lane = 0; lane < stream_count; lane++) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
CUstream stream = streams[lane];
|
CUstream stream = streams[lane];
|
||||||
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||||
@@ -1246,50 +1364,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Keep the GPU queue continuously full by submitting kernels without
|
||||||
|
* synchronizing after every wave. A sync barrier after each small batch
|
||||||
|
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||||
|
* especially when individual kernels are short. Instead we sync at most
|
||||||
|
* once per second (for error detection) and once at the very end. */
|
||||||
double deadline = now_seconds() + (double)seconds;
|
double deadline = now_seconds() + (double)seconds;
|
||||||
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
wave_launches = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
int launched_this_batch = 0;
|
if (!prepared[i].ready) {
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
continue;
|
||||||
if (!prepared[i].ready) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
|
||||||
append_detail(report->details,
|
|
||||||
sizeof(report->details),
|
|
||||||
"%s=FAILED runtime\n",
|
|
||||||
prepared[i].desc.name);
|
|
||||||
for (int j = 0; j < prepared_count; j++) {
|
|
||||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
|
||||||
}
|
|
||||||
cublas.cublasLtDestroy(handle);
|
|
||||||
destroy_streams(cuda, streams, stream_count);
|
|
||||||
cuda->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
prepared[i].iterations++;
|
|
||||||
report->iterations++;
|
|
||||||
wave_launches++;
|
|
||||||
launched_this_batch++;
|
|
||||||
}
|
}
|
||||||
if (launched_this_batch <= 0) {
|
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||||
break;
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"%s=FAILED runtime\n",
|
||||||
|
prepared[i].desc.name);
|
||||||
|
for (int j = 0; j < prepared_count; j++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||||
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
prepared[i].iterations++;
|
||||||
|
report->iterations++;
|
||||||
|
launched++;
|
||||||
}
|
}
|
||||||
if (wave_launches <= 0) {
|
if (launched <= 0) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
double now = now_seconds();
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
if (now >= next_sync || now >= deadline) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||||
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
cublas.cublasLtDestroy(handle);
|
next_sync = now + 1.0;
|
||||||
destroy_streams(cuda, streams, stream_count);
|
|
||||||
cuda->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* Final drain — ensure all queued work finishes before we read results. */
|
||||||
|
cuda->cuCtxSynchronize();
|
||||||
|
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
@@ -1323,10 +1446,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
|
||||||
|
printf("device=%s\n", report->device);
|
||||||
|
printf("device_index=%d\n", device_index);
|
||||||
|
printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
|
||||||
|
printf("backend=%s\n", report->backend);
|
||||||
|
printf("duration_s=%d\n", seconds);
|
||||||
|
printf("buffer_mb=%d\n", report->buffer_mb);
|
||||||
|
printf("streams=%d\n", report->stream_count);
|
||||||
|
printf("iterations=%lu\n", report->iterations);
|
||||||
|
printf("checksum=%llu\n", (unsigned long long)report->checksum);
|
||||||
|
if (report->details[0] != '\0') {
|
||||||
|
printf("%s", report->details);
|
||||||
|
}
|
||||||
|
printf("status=OK\n");
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int seconds = 5;
|
int seconds = 5;
|
||||||
int size_mb = 64;
|
int size_mb = 64;
|
||||||
int device_index = 0;
|
int device_index = 0;
|
||||||
|
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
||||||
|
const char *precision_plan = NULL;
|
||||||
|
const char *precision_plan_seconds = NULL;
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||||
seconds = atoi(argv[++i]);
|
seconds = atoi(argv[++i]);
|
||||||
@@ -1334,8 +1476,16 @@ int main(int argc, char **argv) {
|
|||||||
size_mb = atoi(argv[++i]);
|
size_mb = atoi(argv[++i]);
|
||||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||||
device_index = atoi(argv[++i]);
|
device_index = atoi(argv[++i]);
|
||||||
|
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
||||||
|
precision_filter = argv[++i];
|
||||||
|
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
|
||||||
|
precision_plan = argv[++i];
|
||||||
|
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
|
||||||
|
precision_plan_seconds = argv[++i];
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
fprintf(stderr,
|
||||||
|
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
|
||||||
|
argv[0]);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1395,26 +1545,94 @@ int main(int argc, char **argv) {
|
|||||||
int ok = 0;
|
int ok = 0;
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
#if HAVE_CUBLASLT_HEADERS
|
||||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
|
if (precision_plan != NULL && precision_plan[0] != '\0') {
|
||||||
|
char *plan_copy = strdup(precision_plan);
|
||||||
|
char *plan_seconds_copy = NULL;
|
||||||
|
int phase_seconds[32] = {0};
|
||||||
|
int phase_seconds_count = 0;
|
||||||
|
int phase_ok = 0;
|
||||||
|
if (plan_copy == NULL) {
|
||||||
|
fprintf(stderr, "failed to allocate precision plan buffer\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
|
||||||
|
plan_seconds_copy = strdup(precision_plan_seconds);
|
||||||
|
if (plan_seconds_copy == NULL) {
|
||||||
|
free(plan_copy);
|
||||||
|
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (char *sec_token = strtok(plan_seconds_copy, ",");
|
||||||
|
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
|
||||||
|
sec_token = strtok(NULL, ",")) {
|
||||||
|
while (*sec_token == ' ' || *sec_token == '\t') {
|
||||||
|
sec_token++;
|
||||||
|
}
|
||||||
|
if (*sec_token == '\0') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
phase_seconds[phase_seconds_count++] = atoi(sec_token);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int phase_idx = 0;
|
||||||
|
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
|
||||||
|
while (*token == ' ' || *token == '\t') {
|
||||||
|
token++;
|
||||||
|
}
|
||||||
|
if (*token == '\0') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const char *phase_name = token;
|
||||||
|
const char *phase_filter = token;
|
||||||
|
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
|
||||||
|
phase_filter = NULL;
|
||||||
|
}
|
||||||
|
int phase_duration = seconds;
|
||||||
|
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
|
||||||
|
phase_duration = phase_seconds[phase_idx];
|
||||||
|
}
|
||||||
|
printf("phase_begin=%s\n", phase_name);
|
||||||
|
fflush(stdout);
|
||||||
|
memset(&report, 0, sizeof(report));
|
||||||
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
|
||||||
|
if (ok) {
|
||||||
|
print_stress_report(&report, device_index, phase_duration);
|
||||||
|
phase_ok = 1;
|
||||||
|
} else {
|
||||||
|
printf("phase_error=%s\n", phase_name);
|
||||||
|
if (report.details[0] != '\0') {
|
||||||
|
printf("%s", report.details);
|
||||||
|
if (report.details[strlen(report.details) - 1] != '\n') {
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("status=FAILED\n");
|
||||||
|
}
|
||||||
|
printf("phase_end=%s\n", phase_name);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
free(plan_seconds_copy);
|
||||||
|
free(plan_copy);
|
||||||
|
return phase_ok ? 0 : 1;
|
||||||
|
}
|
||||||
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||||
#endif
|
#endif
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
if (precision_filter != NULL) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
|
||||||
|
precision_filter,
|
||||||
|
name,
|
||||||
|
cc_major,
|
||||||
|
cc_minor);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
int ptx_mb = size_mb;
|
||||||
|
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("device=%s\n", report.device);
|
print_stress_report(&report, device_index, seconds);
|
||||||
printf("device_index=%d\n", device_index);
|
|
||||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
|
||||||
printf("backend=%s\n", report.backend);
|
|
||||||
printf("duration_s=%d\n", seconds);
|
|
||||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
|
||||||
printf("streams=%d\n", report.stream_count);
|
|
||||||
printf("iterations=%lu\n", report.iterations);
|
|
||||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
|
||||||
if (report.details[0] != '\0') {
|
|
||||||
printf("%s", report.details);
|
|
||||||
}
|
|
||||||
printf("status=OK\n");
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
|
|||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$VARIANT" in
|
case "$VARIANT" in
|
||||||
nvidia|amd|nogpu|all) ;;
|
nvidia|nvidia-legacy|amd|nogpu|all) ;;
|
||||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
*) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dirs ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -156,6 +161,7 @@ run_variant() {
|
|||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-e BEE_REQUIRE_MEMTEST=1 \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
"${IMAGE_REF}" \
|
||||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||||
@@ -170,6 +176,7 @@ run_variant() {
|
|||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-e BEE_REQUIRE_MEMTEST=1 \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
"${IMAGE_REF}" \
|
||||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||||
@@ -180,6 +187,9 @@ case "$VARIANT" in
|
|||||||
nvidia)
|
nvidia)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
;;
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
run_variant nvidia-legacy
|
||||||
|
;;
|
||||||
amd)
|
amd)
|
||||||
run_variant amd
|
run_variant amd
|
||||||
;;
|
;;
|
||||||
@@ -188,6 +198,7 @@ case "$VARIANT" in
|
|||||||
;;
|
;;
|
||||||
all)
|
all)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
|
run_variant nvidia-legacy
|
||||||
run_variant amd
|
run_variant amd
|
||||||
run_variant nogpu
|
run_variant nogpu
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
|
||||||
#
|
#
|
||||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
|
||||||
|
# - open -> kernel-open/ sources from the .run installer
|
||||||
|
# - proprietary -> traditional proprietary kernel sources from the .run installer
|
||||||
#
|
#
|
||||||
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
||||||
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
||||||
@@ -17,10 +19,19 @@ set -e
|
|||||||
NVIDIA_VERSION="$1"
|
NVIDIA_VERSION="$1"
|
||||||
DIST_DIR="$2"
|
DIST_DIR="$2"
|
||||||
DEBIAN_KERNEL_ABI="$3"
|
DEBIAN_KERNEL_ABI="$3"
|
||||||
|
NVIDIA_FLAVOR="${4:-open}"
|
||||||
|
|
||||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
|
|
||||||
|
case "$NVIDIA_FLAVOR" in
|
||||||
|
open|proprietary) ;;
|
||||||
|
*)
|
||||||
|
echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
# On Debian, kernel headers are split into two packages:
|
# On Debian, kernel headers are split into two packages:
|
||||||
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
|||||||
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||||
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||||
|
|
||||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
|
||||||
|
|
||||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
|
||||||
echo "=== installing linux-headers-${KVER} ==="
|
|
||||||
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
|
||||||
"linux-headers-${KVER}" \
|
|
||||||
gcc make perl
|
|
||||||
fi
|
|
||||||
echo "kernel headers (arch): $KDIR_ARCH"
|
|
||||||
echo "kernel headers (common): $KDIR_COMMON"
|
|
||||||
|
|
||||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||||
CACHE_LAYOUT_VERSION="2"
|
CACHE_LAYOUT_VERSION="3"
|
||||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||||
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||||
|
echo "=== installing linux-headers-${KVER} ==="
|
||||||
|
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||||
|
"linux-headers-${KVER}" \
|
||||||
|
gcc make perl
|
||||||
|
fi
|
||||||
|
echo "kernel headers (arch): $KDIR_ARCH"
|
||||||
|
echo "kernel headers (common): $KDIR_COMMON"
|
||||||
|
|
||||||
# Download official NVIDIA .run installer with sha256 verification
|
# Download official NVIDIA .run installer with sha256 verification
|
||||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||||
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||||
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
|||||||
rm -rf "$EXTRACT_DIR"
|
rm -rf "$EXTRACT_DIR"
|
||||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||||
|
|
||||||
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
|
# Find kernel source directory for the selected flavor.
|
||||||
KERNEL_SRC=""
|
KERNEL_SRC=""
|
||||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
if [ "$NVIDIA_FLAVOR" = "open" ]; then
|
||||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
|
||||||
done
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
done
|
||||||
|
else
|
||||||
|
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||||
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||||
echo "kernel source: $KERNEL_SRC"
|
echo "kernel source: $KERNEL_SRC"
|
||||||
|
|
||||||
# Build kernel modules
|
# Build kernel modules
|
||||||
|
|||||||
@@ -15,28 +15,49 @@ DIST_DIR="${REPO_ROOT}/dist"
|
|||||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
|
BUILD_VARIANT="nvidia"
|
||||||
BEE_GPU_VENDOR="nvidia"
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
|
||||||
# parse args
|
# parse args
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
--variant) BUILD_VARIANT="$2"; shift 2 ;;
|
||||||
*) echo "unknown arg: $1"; exit 1 ;;
|
*) echo "unknown arg: $1"; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$BEE_GPU_VENDOR" in
|
case "$BUILD_VARIANT" in
|
||||||
nvidia|amd|nogpu) ;;
|
nvidia)
|
||||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="proprietary"
|
||||||
|
;;
|
||||||
|
amd)
|
||||||
|
BEE_GPU_VENDOR="amd"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
nogpu)
|
||||||
|
BEE_GPU_VENDOR="nogpu"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||||
|
|
||||||
export BEE_GPU_VENDOR
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
|
export MEMTEST_VERSION
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
@@ -105,6 +126,37 @@ resolve_iso_version() {
|
|||||||
resolve_audit_version
|
resolve_audit_version
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sync_builder_workdir() {
|
||||||
|
src_dir="$1"
|
||||||
|
dst_dir="$2"
|
||||||
|
|
||||||
|
mkdir -p "$dst_dir"
|
||||||
|
|
||||||
|
# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
|
||||||
|
# after the source tree moved to grub-efi only. Remove bootloaders eagerly
|
||||||
|
# so reused workdirs cannot leak stale templates into a new ISO build.
|
||||||
|
rm -rf "$dst_dir/config/bootloaders"
|
||||||
|
|
||||||
|
rsync -a --delete \
|
||||||
|
--exclude='cache/' \
|
||||||
|
--exclude='chroot/' \
|
||||||
|
--exclude='.build/' \
|
||||||
|
--exclude='*.iso' \
|
||||||
|
--exclude='*.packages' \
|
||||||
|
--exclude='*.contents' \
|
||||||
|
--exclude='*.files' \
|
||||||
|
"$src_dir/" "$dst_dir/"
|
||||||
|
|
||||||
|
if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
|
||||||
|
echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
|
||||||
|
echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
iso_list_files() {
|
iso_list_files() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
|
|
||||||
@@ -182,7 +234,7 @@ dump_memtest_debug() {
|
|||||||
|
|
||||||
echo "-- source bootloader templates --"
|
echo "-- source bootloader templates --"
|
||||||
for cfg in \
|
for cfg in \
|
||||||
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
|
"${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
|
||||||
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
||||||
if [ -f "$cfg" ]; then
|
if [ -f "$cfg" ]; then
|
||||||
echo " file: $cfg"
|
echo " file: $cfg"
|
||||||
@@ -302,6 +354,12 @@ memtest_fail() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nvidia_runtime_fail() {
|
||||||
|
msg="$1"
|
||||||
|
echo "ERROR: ${msg}" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
iso_memtest_present() {
|
iso_memtest_present() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
iso_files="$(mktemp)"
|
iso_files="$(mktemp)"
|
||||||
@@ -439,6 +497,113 @@ validate_iso_memtest() {
|
|||||||
echo "=== memtest validation OK ==="
|
echo "=== memtest validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validate_iso_live_boot_entries() {
|
||||||
|
iso_path="$1"
|
||||||
|
echo "=== validating live boot entries in ISO ==="
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || {
|
||||||
|
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||||
|
echo "ERROR: ISO reader unavailable for live boot validation" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grub_cfg="$(mktemp)"
|
||||||
|
isolinux_cfg="$(mktemp)"
|
||||||
|
|
||||||
|
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||||
|
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
|
||||||
|
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB toram entry is missing" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'linux .*boot=live ' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB live entry is missing boot=live" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux live entry is missing boot=live" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
echo "=== live boot validation OK ==="
|
||||||
|
}
|
||||||
|
|
||||||
|
validate_iso_nvidia_runtime() {
|
||||||
|
iso_path="$1"
|
||||||
|
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||||
|
|
||||||
|
echo "=== validating NVIDIA runtime in ISO ==="
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
|
||||||
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
|
||||||
|
command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
|
||||||
|
|
||||||
|
squashfs_tmp="$(mktemp)"
|
||||||
|
squashfs_list="$(mktemp)"
|
||||||
|
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
|
||||||
|
}
|
||||||
|
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
echo "=== NVIDIA runtime validation OK ==="
|
||||||
|
}
|
||||||
|
|
||||||
append_memtest_grub_entry() {
|
append_memtest_grub_entry() {
|
||||||
grub_cfg="$1"
|
grub_cfg="$1"
|
||||||
[ -f "$grub_cfg" ] || return 1
|
[ -f "$grub_cfg" ] || return 1
|
||||||
@@ -477,6 +642,185 @@ label memtest
|
|||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extract_live_grub_entry() {
|
||||||
|
cfg="$1"
|
||||||
|
live_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
live_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
[ -n "$live_linux" ] || return 1
|
||||||
|
[ -n "$live_initrd" ] || return 1
|
||||||
|
|
||||||
|
grub_kernel="$(printf '%s\n' "$live_linux" | awk '{print $2}')"
|
||||||
|
grub_append="$(printf '%s\n' "$live_linux" | cut -d' ' -f3-)"
|
||||||
|
grub_initrd="$(printf '%s\n' "$live_initrd" | awk '{print $2}')"
|
||||||
|
[ -n "$grub_kernel" ] || return 1
|
||||||
|
[ -n "$grub_append" ] || return 1
|
||||||
|
[ -n "$grub_initrd" ] || return 1
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
load_live_build_append() {
|
||||||
|
lb_dir="$1"
|
||||||
|
binary_cfg="$lb_dir/config/binary"
|
||||||
|
[ -f "$binary_cfg" ] || return 1
|
||||||
|
|
||||||
|
# config/binary is generated by live-build and contains shell variable
|
||||||
|
# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
|
||||||
|
# shellcheck disable=SC1090
|
||||||
|
. "$binary_cfg"
|
||||||
|
|
||||||
|
[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
|
||||||
|
live_build_append="$LB_BOOTAPPEND_LIVE"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
extract_live_isolinux_entry() {
|
||||||
|
cfg="$1"
|
||||||
|
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
isolinux_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
isolinux_append="$(awk '/^[[:space:]]*append[[:space:]]+/ { sub(/^[[:space:]]*append[[:space:]]+/, ""); print; exit }' "$cfg")"
|
||||||
|
[ -n "$isolinux_linux" ] || return 1
|
||||||
|
[ -n "$isolinux_initrd" ] || return 1
|
||||||
|
[ -n "$isolinux_append" ] || return 1
|
||||||
|
|
||||||
|
isolinux_kernel="$(printf '%s\n' "$isolinux_linux" | awk '{print $2}')"
|
||||||
|
isolinux_initrd_path="$(printf '%s\n' "$isolinux_initrd" | awk '{print $2}')"
|
||||||
|
[ -n "$isolinux_kernel" ] || return 1
|
||||||
|
[ -n "$isolinux_initrd_path" ] || return 1
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
write_canonical_grub_cfg() {
|
||||||
|
cfg="$1"
|
||||||
|
kernel="$2"
|
||||||
|
append_live="$3"
|
||||||
|
initrd="$4"
|
||||||
|
|
||||||
|
cat > "$cfg" <<EOF
|
||||||
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||||
|
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||||
|
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||||
|
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||||
|
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||||
|
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||||
|
echo " Hardware Audit LiveCD"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
menuentry "EASY-BEE" {
|
||||||
|
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd ${initrd}
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
|
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd ${initrd}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
fwsetup
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
write_canonical_isolinux_cfg() {
|
||||||
|
cfg="$1"
|
||||||
|
kernel="$2"
|
||||||
|
initrd="$3"
|
||||||
|
append_live="$4"
|
||||||
|
|
||||||
|
cat > "$cfg" <<EOF
|
||||||
|
label live-@FLAVOUR@-normal
|
||||||
|
menu label ^EASY-BEE
|
||||||
|
menu default
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-toram
|
||||||
|
menu label EASY-BEE (^load to RAM)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-gsp-off
|
||||||
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms
|
||||||
|
menu label EASY-BEE (^KMS, no nomodeset)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (KMS, ^GSP=off)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-failsafe
|
||||||
|
menu label EASY-BEE (^fail-safe)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
|
label memtest
|
||||||
|
menu label ^Memory Test (memtest86+)
|
||||||
|
linux /boot/memtest86+x64.bin
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
enforce_live_build_bootloader_assets() {
|
||||||
|
lb_dir="$1"
|
||||||
|
grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
|
||||||
|
grub_dir="$lb_dir/binary/boot/grub"
|
||||||
|
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||||
|
|
||||||
|
if ! load_live_build_append "$lb_dir"; then
|
||||||
|
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
|
||||||
|
live_build_append=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "$grub_cfg" ]; then
|
||||||
|
if extract_live_grub_entry "$grub_cfg"; then
|
||||||
|
mkdir -p "$grub_dir/live-theme"
|
||||||
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||||
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
||||||
|
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
||||||
|
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||||
|
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||||
|
else
|
||||||
|
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "$isolinux_cfg" ]; then
|
||||||
|
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
||||||
|
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
|
||||||
|
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
||||||
|
else
|
||||||
|
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
copy_memtest_from_deb() {
|
copy_memtest_from_deb() {
|
||||||
deb="$1"
|
deb="$1"
|
||||||
dst_boot="$2"
|
dst_boot="$2"
|
||||||
@@ -583,7 +927,7 @@ recover_iso_memtest() {
|
|||||||
|
|
||||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||||
mkdir -p "${OUT_DIR}"
|
mkdir -p "${OUT_DIR}"
|
||||||
@@ -711,6 +1055,7 @@ run_optional_step_sh() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
mkdir -p "${LOG_DIR}" 2>/dev/null || true
|
||||||
step_log="${LOG_DIR}/${step_slug}.log"
|
step_log="${LOG_DIR}/${step_slug}.log"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== optional step: ${step_name} ==="
|
echo "=== optional step: ${step_name} ==="
|
||||||
@@ -734,13 +1079,14 @@ start_build_log
|
|||||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||||
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
||||||
echo "=== refreshing apt index to detect current kernel ABI ==="
|
echo "=== refreshing apt index to detect current kernel ABI ==="
|
||||||
apt-get update -qq
|
apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
|
||||||
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
||||||
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
||||||
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
||||||
| head -1)
|
| head -1)
|
||||||
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
||||||
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
||||||
|
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
||||||
@@ -757,7 +1103,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
|||||||
apt-get install -y "linux-headers-${KVER}"
|
apt-get install -y "linux-headers-${KVER}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -809,9 +1155,37 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
|
|
||||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
|
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||||
|
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||||
|
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||||
|
if [ -n "$fp4_type_match" ]; then
|
||||||
|
echo "fp4_header_symbol=present"
|
||||||
|
echo "$fp4_type_match"
|
||||||
|
else
|
||||||
|
echo "fp4_header_symbol=missing"
|
||||||
|
fi
|
||||||
|
if [ -n "$fp4_scale_match" ]; then
|
||||||
|
echo "fp4_scale_mode_symbol=present"
|
||||||
|
echo "$fp4_scale_match"
|
||||||
|
else
|
||||||
|
echo "fp4_scale_mode_symbol=missing"
|
||||||
|
fi
|
||||||
|
|
||||||
GPU_STRESS_NEED_BUILD=1
|
GPU_STRESS_NEED_BUILD=1
|
||||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
if [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
GPU_STRESS_NEED_BUILD=0
|
GPU_STRESS_NEED_BUILD=0
|
||||||
|
for dep in \
|
||||||
|
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||||
|
"${BUILDER_DIR}/VERSIONS"; do
|
||||||
|
if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
|
GPU_STRESS_NEED_BUILD=1
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
|
||||||
|
find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
|
||||||
|
GPU_STRESS_NEED_BUILD=1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||||
@@ -825,21 +1199,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
else
|
else
|
||||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||||
fi
|
fi
|
||||||
|
echo "=== bee-gpu-burn compiled profile probe ==="
|
||||||
|
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
|
||||||
|
echo "fp4_profile_string=present"
|
||||||
|
else
|
||||||
|
echo "fp4_profile_string=missing"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
# Sync builder config into variant work dir, preserving lb cache.
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
rsync -a --delete \
|
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
|
||||||
--exclude='cache/' \
|
|
||||||
--exclude='chroot/' \
|
|
||||||
--exclude='.build/' \
|
|
||||||
--exclude='*.iso' \
|
|
||||||
--exclude='*.packages' \
|
|
||||||
--exclude='*.contents' \
|
|
||||||
--exclude='*.files' \
|
|
||||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
|
||||||
|
|
||||||
# Share deb package cache across variants.
|
# Share deb package cache across variants.
|
||||||
# Restore: populate work dir cache from shared cache before build.
|
# Restore: populate work dir cache from shared cache before build.
|
||||||
@@ -937,10 +1309,10 @@ done
|
|||||||
# --- NVIDIA kernel modules and userspace libs ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
@@ -1011,13 +1383,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u
|
|||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||||
|
NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
|
||||||
NCCL_VERSION=${NCCL_VERSION}
|
NCCL_VERSION=${NCCL_VERSION}
|
||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||||
@@ -1029,6 +1402,7 @@ fi
|
|||||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||||
|
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||||
BUILD_DATE=${BUILD_DATE}
|
BUILD_DATE=${BUILD_DATE}
|
||||||
GIT_COMMIT=${GIT_COMMIT}
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
@@ -1039,6 +1413,11 @@ EOF
|
|||||||
|
|
||||||
# Write GPU vendor marker for hooks
|
# Write GPU vendor marker for hooks
|
||||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||||
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
else
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
fi
|
||||||
|
|
||||||
# Patch motd with build info
|
# Patch motd with build info
|
||||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||||
@@ -1074,6 +1453,7 @@ fi
|
|||||||
# --- substitute version placeholders in package list and archive ---
|
# --- substitute version placeholders in package list and archive ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
sed -i \
|
sed -i \
|
||||||
|
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
@@ -1109,17 +1489,25 @@ fi
|
|||||||
|
|
||||||
# --- build ISO using live-build ---
|
# --- build ISO using live-build ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||||
|
|
||||||
# Export for auto/config
|
# Export for auto/config
|
||||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
|
echo "=== enforcing canonical bootloader assets ==="
|
||||||
|
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||||
|
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||||
|
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
|
||||||
|
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||||
|
|
||||||
# --- persist deb package cache back to shared location ---
|
# --- persist deb package cache back to shared location ---
|
||||||
# This allows the second variant to reuse all downloaded packages.
|
# This allows the second variant to reuse all downloaded packages.
|
||||||
@@ -1144,9 +1532,11 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
echo "=== done (${BUILD_VARIANT}) ==="
|
||||||
echo "ISO: $ISO_OUT"
|
echo "ISO: $ISO_OUT"
|
||||||
if command -v stat >/dev/null 2>&1; then
|
if command -v stat >/dev/null 2>&1; then
|
||||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ insmod serial
|
|||||||
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||||
|
|
||||||
insmod gfxterm
|
insmod gfxterm
|
||||||
insmod png
|
|
||||||
|
|
||||||
source /boot/grub/theme.cfg
|
|
||||||
|
|
||||||
terminal_input console serial
|
terminal_input console serial
|
||||||
terminal_output gfxterm serial
|
terminal_output gfxterm serial
|
||||||
|
|
||||||
|
insmod png
|
||||||
|
source /boot/grub/theme.cfg
|
||||||
28
iso/builder/config/bootloaders/grub-efi/grub.cfg
Normal file
28
iso/builder/config/bootloaders/grub-efi/grub.cfg
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
|
menuentry "EASY-BEE" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
fwsetup
|
||||||
|
}
|
||||||
|
fi
|
||||||
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 78 KiB |
@@ -5,6 +5,13 @@ title-text: ""
|
|||||||
message-font: "Unifont Regular 16"
|
message-font: "Unifont Regular 16"
|
||||||
terminal-font: "Unifont Regular 16"
|
terminal-font: "Unifont Regular 16"
|
||||||
|
|
||||||
|
#bee logo - centered, upper third of screen
|
||||||
|
+ image {
|
||||||
|
top = 4%
|
||||||
|
left = 50%-200
|
||||||
|
file = "bee-logo.png"
|
||||||
|
}
|
||||||
|
|
||||||
#help bar at the bottom
|
#help bar at the bottom
|
||||||
+ label {
|
+ label {
|
||||||
top = 100%-50
|
top = 100%-50
|
||||||
@@ -21,17 +28,17 @@ terminal-font: "Unifont Regular 16"
|
|||||||
+ boot_menu {
|
+ boot_menu {
|
||||||
left = 20%
|
left = 20%
|
||||||
width = 60%
|
width = 60%
|
||||||
top = 62%
|
top = 65%
|
||||||
height = 38%-80
|
height = 35%-80
|
||||||
item_color = "#c88000"
|
item_color = "#c88000"
|
||||||
item_font = "Unifont Regular 16"
|
item_font = "Unifont Regular 16"
|
||||||
selected_item_color= "#f5a800"
|
selected_item_color= "#f5a800"
|
||||||
selected_item_font = "Unifont Regular 16"
|
selected_item_font = "Unifont Regular 16"
|
||||||
item_height = 16
|
item_height = 20
|
||||||
item_padding = 0
|
item_padding = 2
|
||||||
item_spacing = 4
|
item_spacing = 4
|
||||||
icon_width = 0
|
icon_width = 0
|
||||||
icon_heigh = 0
|
icon_height = 0
|
||||||
item_icon_space = 0
|
item_icon_space = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
9
iso/builder/config/bootloaders/grub-efi/theme.cfg
Normal file
9
iso/builder/config/bootloaders/grub-efi/theme.cfg
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
set color_normal=light-gray/black
|
||||||
|
set color_highlight=yellow/black
|
||||||
|
|
||||||
|
if [ -e /boot/grub/live-theme/theme.txt ]; then
|
||||||
|
set theme=/boot/grub/live-theme/theme.txt
|
||||||
|
else
|
||||||
|
set menu_color_normal=yellow/black
|
||||||
|
set menu_color_highlight=white/brown
|
||||||
|
fi
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
source /boot/grub/config.cfg
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
|
||||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
|
||||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
|
||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
|
||||||
chainloader /boot/memtest86+x64.efi
|
|
||||||
}
|
|
||||||
else
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
|
||||||
linux16 /boot/memtest86+x64.bin
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
|
||||||
menuentry "UEFI Firmware Settings" {
|
|
||||||
fwsetup
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
set color_normal=light-gray/black
|
|
||||||
set color_highlight=white/dark-gray
|
|
||||||
|
|
||||||
if [ -e /boot/grub/splash.png ]; then
|
|
||||||
set theme=/boot/grub/live-theme/theme.txt
|
|
||||||
else
|
|
||||||
set menu_color_normal=cyan/black
|
|
||||||
set menu_color_highlight=white/dark-gray
|
|
||||||
fi
|
|
||||||
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
|
|||||||
menu default
|
menu default
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms
|
|
||||||
menu label EASY-BEE (^graphics/KMS)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE (^load to RAM)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-gsp-off
|
label live-@FLAVOUR@-gsp-off
|
||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms-gsp-off
|
label live-@FLAVOUR@-kms
|
||||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
menu label EASY-BEE (^KMS, no nomodeset)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (KMS, ^GSP=off)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
label memtest
|
label memtest
|
||||||
menu label ^Memory Test (memtest86+)
|
menu label ^Memory Test (memtest86+)
|
||||||
|
|||||||
@@ -25,11 +25,14 @@ ensure_bee_console_user() {
|
|||||||
ensure_bee_console_user
|
ensure_bee_console_user
|
||||||
|
|
||||||
# Enable common bee services
|
# Enable common bee services
|
||||||
|
systemctl enable bee-hpc-tuning.service
|
||||||
systemctl enable bee-network.service
|
systemctl enable bee-network.service
|
||||||
systemctl enable bee-preflight.service
|
systemctl enable bee-preflight.service
|
||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
systemctl enable bee-sshsetup.service
|
systemctl enable bee-sshsetup.service
|
||||||
|
systemctl enable bee-selfheal.timer
|
||||||
|
systemctl enable bee-boot-status.service
|
||||||
systemctl enable ssh.service
|
systemctl enable ssh.service
|
||||||
systemctl enable lightdm.service 2>/dev/null || true
|
systemctl enable lightdm.service 2>/dev/null || true
|
||||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||||
@@ -40,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
|||||||
# Enable GPU-vendor specific services
|
# Enable GPU-vendor specific services
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||||
systemctl enable bee-nvidia.service
|
systemctl enable bee-nvidia.service
|
||||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
@@ -53,11 +57,16 @@ fi
|
|||||||
# nogpu: no GPU services needed
|
# nogpu: no GPU services needed
|
||||||
|
|
||||||
# Ensure scripts are executable
|
# Ensure scripts are executable
|
||||||
|
chmod +x /usr/local/bin/bee-hpc-tuning 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
|
||||||
|
#
|
||||||
|
# live-boot tries "losetup --replace --direct-io=on" when re-associating the
|
||||||
|
# loop device to the RAM copy in /dev/shm. tmpfs does not support O_DIRECT,
|
||||||
|
# so the ioctl returns EINVAL and the verification step fails.
|
||||||
|
#
|
||||||
|
# The patch replaces the replace call so that if --direct-io=on fails it falls
|
||||||
|
# back to a plain replace without direct-io, and also relaxes the verification
|
||||||
|
# to a warning so the boot continues even when re-association is imperfect.
|
||||||
|
set -e
|
||||||
|
|
||||||
|
TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
|
||||||
|
|
||||||
|
if [ ! -f "${TORAM_SCRIPT}" ]; then
|
||||||
|
echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
# Replace any losetup --replace call that includes --direct-io=on with a
|
||||||
|
# version that first tries with direct-io, then retries without it.
|
||||||
|
#
|
||||||
|
# The sed expression turns:
|
||||||
|
# losetup --replace ... --direct-io=on LOOP FILE
|
||||||
|
# into a shell snippet that tries both, silently.
|
||||||
|
#
|
||||||
|
# We also downgrade the fatal "Task finished with error." block to a warning
|
||||||
|
# so the boot continues if re-association fails (squashfs still accessible).
|
||||||
|
|
||||||
|
# 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
|
||||||
|
sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||||
|
sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
# 2. Turn the hard error into a warning so boot continues.
|
||||||
|
# live-boot prints this exact string when verification fails.
|
||||||
|
sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
echo "9010-fix-toram: patch applied"
|
||||||
|
grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
|
||||||
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9011-toram-rsync.hook.chroot
|
||||||
|
#
|
||||||
|
# Adds rsync to the initramfs so that live-boot's toram code takes the
|
||||||
|
# rsync --progress path instead of the silent "cp -a" fallback.
|
||||||
|
#
|
||||||
|
# live-boot's 9990-toram-todisk.sh already contains:
|
||||||
|
# if [ -x /bin/rsync ]; then
|
||||||
|
# rsync -a --progress ... 1>/dev/console
|
||||||
|
# else
|
||||||
|
# cp -a ... # no output
|
||||||
|
# fi
|
||||||
|
#
|
||||||
|
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
|
||||||
|
# which copies the binary + all shared-library dependencies into the initrd.
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
HOOK_DIR="/etc/initramfs-tools/hooks"
|
||||||
|
HOOK="${HOOK_DIR}/bee-rsync"
|
||||||
|
|
||||||
|
mkdir -p "${HOOK_DIR}"
|
||||||
|
|
||||||
|
cat > "${HOOK}" << 'EOF'
|
||||||
|
#!/bin/sh
|
||||||
|
# initramfs hook: include rsync for live-boot toram progress output
|
||||||
|
PREREQ=""
|
||||||
|
prereqs() { echo "$PREREQ"; }
|
||||||
|
case "$1" in prereqs) prereqs; exit 0 ;; esac
|
||||||
|
|
||||||
|
. /usr/share/initramfs-tools/hook-functions
|
||||||
|
|
||||||
|
if [ -x /usr/bin/rsync ]; then
|
||||||
|
copy_exec /usr/bin/rsync /bin
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
|
||||||
|
chmod +x "${HOOK}"
|
||||||
|
|
||||||
|
echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
|
||||||
|
|
||||||
|
# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
|
||||||
|
KVER=$(ls /lib/modules | sort -V | tail -1)
|
||||||
|
echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
|
||||||
|
update-initramfs -u -k "${KVER}"
|
||||||
|
echo "9011-toram-rsync: done"
|
||||||
@@ -5,6 +5,8 @@ set -e
|
|||||||
|
|
||||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
|
# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
|
||||||
|
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
|
||||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||||
BINARY_BOOT_DIR="binary/boot"
|
BINARY_BOOT_DIR="binary/boot"
|
||||||
GRUB_CFG="binary/boot/grub/grub.cfg"
|
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||||
@@ -24,15 +26,23 @@ fail_or_warn() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
|
||||||
|
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
|
||||||
|
# The template already has memtest entries hardcoded, so a missing config file
|
||||||
|
# here is not an error; validate_iso_memtest() checks the final ISO instead.
|
||||||
|
warn_only() {
|
||||||
|
log "WARNING: $1"
|
||||||
|
}
|
||||||
|
|
||||||
copy_memtest_file() {
|
copy_memtest_file() {
|
||||||
src="$1"
|
src="$1"
|
||||||
base="$(basename "$src")"
|
dst_name="${2:-$(basename "$src")}"
|
||||||
dst="${BINARY_BOOT_DIR}/${base}"
|
dst="${BINARY_BOOT_DIR}/${dst_name}"
|
||||||
|
|
||||||
[ -f "$src" ] || return 1
|
[ -f "$src" ] || return 1
|
||||||
mkdir -p "${BINARY_BOOT_DIR}"
|
mkdir -p "${BINARY_BOOT_DIR}"
|
||||||
cp "$src" "$dst"
|
cp "$src" "$dst"
|
||||||
log "copied ${base} from ${src}"
|
log "copied ${dst_name} from ${src}"
|
||||||
}
|
}
|
||||||
|
|
||||||
extract_memtest_from_deb() {
|
extract_memtest_from_deb() {
|
||||||
@@ -41,14 +51,44 @@ extract_memtest_from_deb() {
|
|||||||
|
|
||||||
log "extracting memtest payload from ${deb}"
|
log "extracting memtest payload from ${deb}"
|
||||||
dpkg-deb -x "$deb" "$tmpdir"
|
dpkg-deb -x "$deb" "$tmpdir"
|
||||||
for f in ${MEMTEST_FILES}; do
|
|
||||||
if [ -f "${tmpdir}/boot/${f}" ]; then
|
# EFI binary: both 5.x and 6.x use memtest86+x64.efi
|
||||||
copy_memtest_file "${tmpdir}/boot/${f}"
|
if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
|
||||||
fi
|
copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
|
||||||
done
|
fi
|
||||||
|
|
||||||
|
# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
|
||||||
|
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
|
||||||
|
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
|
||||||
|
fi
|
||||||
|
|
||||||
rm -rf "$tmpdir"
|
rm -rf "$tmpdir"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
download_and_extract_memtest() {
|
||||||
|
tmpdl="$(mktemp -d)"
|
||||||
|
if [ -n "${MEMTEST_VERSION:-}" ]; then
|
||||||
|
pkg_spec="memtest86+=${MEMTEST_VERSION}"
|
||||||
|
else
|
||||||
|
pkg_spec="memtest86+"
|
||||||
|
fi
|
||||||
|
log "downloading ${pkg_spec} from apt"
|
||||||
|
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
|
||||||
|
log "apt download failed, retrying after apt-get update"
|
||||||
|
apt-get update -qq >/dev/null 2>&1 || true
|
||||||
|
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
|
||||||
|
fi
|
||||||
|
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||||
|
if [ -n "$deb" ]; then
|
||||||
|
extract_memtest_from_deb "$deb"
|
||||||
|
else
|
||||||
|
log "apt download of memtest86+ failed"
|
||||||
|
fi
|
||||||
|
rm -rf "$tmpdl"
|
||||||
|
}
|
||||||
|
|
||||||
ensure_memtest_binaries() {
|
ensure_memtest_binaries() {
|
||||||
missing=0
|
missing=0
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
|
|||||||
done
|
done
|
||||||
[ "$missing" -eq 1 ] || return 0
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 1. Try files already placed by lb binary_memtest or chroot
|
||||||
for root in chroot/boot /boot; do
|
for root in chroot/boot /boot; do
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
||||||
done
|
done
|
||||||
|
# 6.x BIOS binary may lack x64 in name — copy with normalised name
|
||||||
|
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
|
||||||
|
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
missing=0
|
missing=0
|
||||||
@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
|
|||||||
done
|
done
|
||||||
[ "$missing" -eq 1 ] || return 0
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
|
||||||
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
||||||
[ -d "$root" ] || continue
|
[ -d "$root" ] || continue
|
||||||
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
||||||
@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
|
|||||||
break
|
break
|
||||||
done
|
done
|
||||||
|
|
||||||
|
missing=0
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||||
|
done
|
||||||
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
|
||||||
|
download_and_extract_memtest
|
||||||
|
|
||||||
missing=0
|
missing=0
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
||||||
@@ -88,7 +143,7 @@ ensure_memtest_binaries() {
|
|||||||
|
|
||||||
ensure_grub_entry() {
|
ensure_grub_entry() {
|
||||||
[ -f "$GRUB_CFG" ] || {
|
[ -f "$GRUB_CFG" ] || {
|
||||||
fail_or_warn "missing ${GRUB_CFG}"
|
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,7 +169,7 @@ EOF
|
|||||||
|
|
||||||
ensure_isolinux_entry() {
|
ensure_isolinux_entry() {
|
||||||
[ -f "$ISOLINUX_CFG" ] || {
|
[ -f "$ISOLINUX_CFG" ] || {
|
||||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user