Add vendor RAID tools for livecd

Fix fast-path: treat bootloader config changes as heavy
config/bootloaders was missing from the needs_full_build heavy-file list, so changes to GRUB theme assets (e.g. bee-logo.png RGBA→RGB fix in 333c44f) were silently skipped by the squashfs-surgery fast-path. The old broken PNG stayed in boot/grub/live-theme/ inside the ISO. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 17:31:25 +03:00 · 2026-04-29 15:36:29 +03:00 · 2026-04-29 13:18:50 +03:00 · 2026-04-29 12:34:54 +03:00 · 2026-04-29 11:15:16 +03:00 · 2026-04-29 10:58:26 +03:00
105 changed files with 12317 additions and 4222 deletions
@@ -3,3 +3,4 @@
 dist/
 iso/out/
 build-cache/
+audit/bee
@@ -2,6 +2,7 @@ package main

 import (
 	"context"
+	"errors"
 	"flag"
 	"fmt"
 	"io"
@@ -67,10 +68,14 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 		return runSupportBundle(args[1:], stdout, stderr)
 	case "web":
 		return runWeb(args[1:], stdout, stderr)
+	case "blackbox":
+		return runBlackbox(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
 	case "benchmark":
 		return runBenchmark(args[1:], stdout, stderr)
+	case "bee-worker":
+		return runBeeWorker(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -88,8 +93,10 @@ func printRootUsage(w io.Writer) {
  bee export  --target <device>
  bee support-bundle --output stdout|file:<path>
  bee web     --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
+  bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
  bee benchmark nvidia [--profile standard|stability|overnight]
+  bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
  bee version
  bee help [command]`)
 }
@@ -106,10 +113,14 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runSupportBundle([]string{"--help"}, stdout, stdout)
 	case "web":
 		return runWeb([]string{"--help"}, stdout, stdout)
+	case "blackbox":
+		return runBlackbox([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
 	case "benchmark":
 		return runBenchmark([]string{"--help"}, stdout, stderr)
+	case "bee-worker":
+		return runBeeWorker([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -335,6 +346,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
 	return 0
 }

+// runBlackbox implements the "bee blackbox" subcommand. It parses
+// --export-dir and --state-file, rejects positional arguments, and then hands
+// control to app.RunBlackbox with a background context.
+//
+// Exit codes: 0 on success, on --help, or when the run ends with
+// context.Canceled; 1 when the run fails with any other error; 2 on a usage
+// error.
+func runBlackbox(args []string, stdout, stderr io.Writer) int {
+	flags := flag.NewFlagSet("blackbox", flag.ContinueOnError)
+	flags.SetOutput(stderr)
+	exportDir := flags.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
+	statePath := flags.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
+	flags.Usage = func() {
+		fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
+		flags.PrintDefaults()
+	}
+
+	switch parseErr := flags.Parse(args); {
+	case parseErr == nil:
+		// Flags parsed cleanly; carry on below.
+	case parseErr == flag.ErrHelp:
+		return 0
+	default:
+		return 2
+	}
+	if flags.NArg() != 0 {
+		flags.Usage()
+		return 2
+	}
+
+	slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
+	runErr := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New())
+	if runErr == nil || errors.Is(runErr, context.Canceled) {
+		return 0
+	}
+	slog.Error("run blackbox", "err", runErr)
+	return 1
+}
+
 func runSAT(args []string, stdout, stderr io.Writer) int {
 	if len(args) == 0 {
 		fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
@@ -462,6 +500,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
 	return 0
 }

+// runBeeWorker implements the "bee bee-worker" subcommand. It parses
+// --export-dir and --task-id and then runs the persisted task through
+// webui.RunPersistedTask, whose return value becomes the exit code.
+//
+// Exit codes produced here: 0 on --help, 2 on a usage error (bad flag,
+// positional arguments, or a missing --task-id).
+func runBeeWorker(args []string, stdout, stderr io.Writer) int {
+	fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
+	taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
+	fs.Usage = func() {
+		fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		if errors.Is(err, flag.ErrHelp) {
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 {
+		fs.Usage()
+		return 2
+	}
+	// The usage line documents --task-id as mandatory; fail fast with a clear
+	// message instead of handing an empty identifier to the task runner.
+	id := strings.TrimSpace(*taskID)
+	if id == "" {
+		fmt.Fprintln(stderr, "bee bee-worker: --task-id is required")
+		fs.Usage()
+		return 2
+	}
+	return webui.RunPersistedTask(*exportDir, id, stdout, stderr)
+}
+
 func parseBenchmarkIndexCSV(raw string) ([]int, error) {
 	raw = strings.TrimSpace(raw)
 	if raw == "" {
@@ -5,22 +5,18 @@ go 1.25.0
 replace reanimator/chart => ../internal/chart

 require (
-	github.com/go-analyze/charts v0.5.26
+	modernc.org/sqlite v1.48.0
 	reanimator/chart v0.0.0-00010101000000-000000000000
 )

 require (
 	github.com/dustin/go-humanize v1.0.1 // indirect
-	github.com/go-analyze/bulk v0.1.3 // indirect
-	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/ncruces/go-strftime v1.0.0 // indirect
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
-	golang.org/x/image v0.24.0 // indirect
 	golang.org/x/sys v0.42.0 // indirect
-	modernc.org/libc v1.70.0 // indirect
+	modernc.org/libc v1.72.0 // indirect
 	modernc.org/mathutil v1.7.1 // indirect
 	modernc.org/memory v1.11.0 // indirect
-	modernc.org/sqlite v1.48.0 // indirect
 )
@@ -1,37 +1,51 @@
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
 github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
-github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
-github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
-github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
-github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
-github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
-github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
+github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
+github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
+github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
 github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
-github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
-github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
-golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
-golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
+golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
+golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
 golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
-modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
+golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
+golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
+modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
+modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
+modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
+modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
+modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
+modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
+modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
+modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
+modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
+modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
+modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
+modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
+modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
+modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
 modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
 modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
 modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
 modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
+modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
+modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
+modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
+modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
 modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
 modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
+modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
+modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
+modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
+modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
@@ -19,20 +19,22 @@ import (
 )

 var (
-	DefaultExportDir        = "/appdata/bee/export"
-	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
-	DefaultBeeBenchBaseDir  = DefaultExportDir + "/bee-bench"
-	DefaultBeeBenchPerfDir  = DefaultBeeBenchBaseDir + "/perf"
-	DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
+	DefaultExportDir                     = "/appdata/bee/export"
+	DefaultAuditJSONPath                 = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditLogPath                  = DefaultExportDir + "/bee-audit.log"
+	DefaultWebLogPath                    = DefaultExportDir + "/bee-web.log"
+	DefaultNetworkLogPath                = DefaultExportDir + "/bee-network.log"
+	DefaultNvidiaLogPath                 = DefaultExportDir + "/bee-nvidia.log"
+	DefaultSSHLogPath                    = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultRuntimeJSONPath               = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeLogPath                = DefaultExportDir + "/runtime-health.log"
+	DefaultTechDumpDir                   = DefaultExportDir + "/techdump"
+	DefaultSATBaseDir                    = DefaultExportDir + "/bee-sat"
+	DefaultBeeBenchBaseDir               = DefaultExportDir + "/bee-bench"
+	DefaultBeeBenchAutotuneDir           = DefaultBeeBenchBaseDir + "/autotune"
+	DefaultBeeBenchPerfDir               = DefaultBeeBenchBaseDir + "/perf"
+	DefaultBeeBenchPowerDir              = DefaultBeeBenchBaseDir + "/power"
+	DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
 )

 type App struct {
@@ -125,6 +127,7 @@ type satRunner interface {
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
+	RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
 	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -146,7 +149,7 @@ type satRunner interface {
 	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
-	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 }

 type runtimeChecker interface {
@@ -304,7 +307,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
 	}
 	filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
 	tmpPath := filepath.Join(os.TempDir(), filename)
-	data, err := os.ReadFile(DefaultAuditJSONPath)
+	data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
 	if err != nil {
 		return "", err
 	}
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultBeeBenchPerfDir
 	}
+	resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
+	if err != nil {
+		return "", err
+	}
+	opts.ServerPowerSource = resolved.SelectedSource
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }

@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultBeeBenchPowerDir
 	}
+	resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
+	if err != nil {
+		return "", err
+	}
+	opts.ServerPowerSource = resolved.SelectedSource
 	return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
 }

+// RunNvidiaPowerSourceAutotuneCtx forwards a power-source autotune request to
+// the SAT runner. A blank baseDir is replaced with
+// DefaultBeeBenchAutotuneDir; every other argument is passed through as is.
+func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultBeeBenchAutotuneDir
+	}
+	return a.sat.RunNvidiaPowerSourceAutotune(ctx, dir, opts, benchmarkKind, logFunc)
+}
+
+// LoadBenchmarkPowerAutotune loads the power-source autotune configuration
+// stored at DefaultBeeBenchPowerSourceConfigPath.
+func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
+	cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
+	return cfg, err
+}
+
+func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
+	cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
+	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
+		if logFunc != nil {
+			logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
+		}
+		return *cfg, nil
+	}
+	if logFunc != nil {
+		logFunc("benchmark autotune: no saved power source config, running autotune first")
+	}
+	autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
+	if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, err
+	}
+	cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
+	if err != nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, err
+	}
+	return *cfg, nil
+}
+
 func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -744,8 +790,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
 	return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
 }

+// RunNCCLTests forwards an NCCL test request for the given GPU indices to the
+// SAT runner. A blank baseDir is replaced with DefaultSATBaseDir.
+func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNCCLTests(ctx, dir, gpuIndices, logFunc)
+}
+
 func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
-	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
+	path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
 	body := "Results: " + path
 	if err != nil && err != context.Canceled {
 		body += "\nERROR: " + err.Error()
@@ -9,6 +9,7 @@ import (
 	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"

 	"bee/audit/internal/platform"
@@ -123,11 +124,13 @@ type fakeSAT struct {
 	runNvidiaFn               func(string) (string, error)
 	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
 	runNvidiaPowerBenchFn     func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaAutotuneFn       func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
 	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
 	runNvidiaComputeFn        func(string, int, []int) (string, error)
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
 	runNvidiaPulseFn          func(string, int, []int) (string, error)
 	runNvidiaBandwidthFn      func(string, []int) (string, error)
+	runNCCLFn                 func(string, []int) (string, error)
 	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
 	runMemoryFn               func(string) (string, error)
 	runStorageFn              func(string) (string, error)
@@ -162,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
 	return f.runNvidiaFn(baseDir)
 }

+// RunNvidiaPowerSourceAutotune is the fake autotune entry point. It calls
+// runNvidiaAutotuneFn when the test provided one and falls back to
+// runNvidiaFn otherwise.
+func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
+	if f.runNvidiaAutotuneFn == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
+}
+
 func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNvidiaTargetedStressFn != nil {
 		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -287,10 +297,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 	return "", nil
 }

-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+// RunNCCLTests is the fake NCCL entry point. It delegates to runNCCLFn when
+// the test provided one; otherwise it returns an empty path and a nil error.
+func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNCCLFn != nil {
+		return f.runNCCLFn(baseDir, gpuIndices)
+	}
 	return "", nil
 }

+// TestRunNCCLTestsPassesSelectedGPUs checks that App.RunNCCLTests hands the
+// caller's base directory and GPU index list to the SAT runner unchanged and
+// returns the runner's result path.
+func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
+	t.Parallel()
+
+	const wantPath = "/tmp/nccl-tests.tar.gz"
+	var (
+		seenDir  string
+		seenGPUs []int
+	)
+	capture := func(baseDir string, gpuIndices []int) (string, error) {
+		seenDir = baseDir
+		// Copy so later mutation by the caller cannot affect the assertion.
+		seenGPUs = append([]int(nil), gpuIndices...)
+		return wantPath, nil
+	}
+	a := &App{sat: fakeSAT{runNCCLFn: capture}}
+
+	path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
+	switch {
+	case err != nil:
+		t.Fatalf("RunNCCLTests error: %v", err)
+	case path != wantPath:
+		t.Fatalf("path=%q want %q", path, wantPath)
+	case seenDir != "/tmp/sat":
+		t.Fatalf("baseDir=%q want %q", seenDir, "/tmp/sat")
+	case len(seenGPUs) != 2 || seenGPUs[0] != 3 || seenGPUs[1] != 1:
+		t.Fatalf("gpuIndices=%v want [3 1]", seenGPUs)
+	}
+}
+
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()

@@ -775,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
 		t.Fatal(err)
 	}
+	if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
+		t.Fatal(err)
+	}
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
 		t.Fatal(err)
 	}
@@ -802,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	tr := tar.NewReader(gzr)
 	var names []string
 	var auditJSON string
+	var manifest string
 	for {
 		hdr, err := tr.Next()
 		if errors.Is(err, io.EOF) {
@@ -818,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 			}
 			auditJSON = string(body)
 		}
+		if strings.HasSuffix(hdr.Name, "/manifest.txt") {
+			body, err := io.ReadAll(tr)
+			if err != nil {
+				t.Fatalf("read manifest entry: %v", err)
+			}
+			manifest = string(body)
+		}
 	}

 	for _, want := range []string{
@@ -861,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
 		t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
 	}
+	if !contains(manifest, "files:") {
+		t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
+	}
+	if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
+		t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
+	}
 }

 func TestMainBanner(t *testing.T) {
@@ -2,10 +2,29 @@ package app

 import (
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 )

+// readFileLimited loads the whole of path into memory and returns its
+// contents. Files holding more than maxBytes bytes are rejected with an
+// error, which guards against exhausting memory on corrupted or unexpectedly
+// large data files.
+func readFileLimited(path string, maxBytes int64) ([]byte, error) {
+	file, openErr := os.Open(path)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer file.Close()
+
+	// Allow one byte past the limit so an oversized file can be detected
+	// without buffering more than maxBytes+1 bytes.
+	capped := io.LimitReader(file, maxBytes+1)
+	contents, readErr := io.ReadAll(capped)
+	switch {
+	case readErr != nil:
+		return nil, readErr
+	case int64(len(contents)) > maxBytes:
+		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
+	}
+	return contents, nil
+}
+
 func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
@@ -0,0 +1,779 @@
+package app
+
+import (
+	"bytes"
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+// Tunables for the blackbox subsystem.
+const (
+	// blackboxMarkerName is the marker file name, looked up directly under a
+	// target's mountpoint.
+	blackboxMarkerName        = ".bee-blackbox"
+	// blackboxDiscoverInterval is the period of the discovery ticker in
+	// RunBlackbox.
+	blackboxDiscoverInterval  = 2 * time.Second
+	// blackboxMinFlushPeriod is the flush period a new worker starts with.
+	blackboxMinFlushPeriod    = 1 * time.Second
+	// NOTE(review): presumably the upper bound for a worker's flush period;
+	// its use is outside this view — confirm.
+	blackboxMaxFlushPeriod    = 30 * time.Second
+	// NOTE(review): presumably related to blackboxWorker.fastCycles; its use
+	// is outside this view — confirm.
+	blackboxRecoveryFastCount = 5
+)
+
+// DefaultBlackboxStatePath is the state file used when callers pass a blank
+// path to RunBlackbox or ReadBlackboxState.
+var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
+
+// Package-level indirections for command execution and the clock.
+// NOTE(review): presumably kept as variables so tests can replace them —
+// confirm.
+var (
+	blackboxExecCommand = exec.Command
+	// blackboxNow returns the current time in UTC.
+	blackboxNow         = func() time.Time { return time.Now().UTC() }
+)
+
+// BlackboxMarker is the JSON-tagged enrollment record of a blackbox target.
+// EnableBlackboxTarget creates one (Version 1, a fresh enrollment ID, the
+// current UTC time in RFC 3339 form, and the host name) when the target has
+// no marker with an enrollment ID yet, and writes it to the target.
+type BlackboxMarker struct {
+	Version      int    `json:"version"`
+	EnrollmentID string `json:"enrollment_id"`
+	CreatedAtUTC string `json:"created_at_utc"`
+	Host         string `json:"host,omitempty"`
+}
+
+// BlackboxTargetStatus is one entry of BlackboxState.Targets: the JSON view
+// of a single blackbox target.
+// NOTE(review): the code that fills these fields is outside this view; they
+// appear to mirror the per-worker fields of blackboxWorker — confirm.
+type BlackboxTargetStatus struct {
+	EnrollmentID      string                   `json:"enrollment_id"`
+	Device            string                   `json:"device"`
+	FS                platform.RemovableTarget `json:"fs"`
+	BootFolder        string                   `json:"boot_folder"`
+	Status            string                   `json:"status"`
+	LastSyncAtUTC     string                   `json:"last_sync_at_utc,omitempty"`
+	LastCycleDuration string                   `json:"last_cycle_duration,omitempty"`
+	FlushPeriod       string                   `json:"flush_period"`
+	LastError         string                   `json:"last_error,omitempty"`
+	Mountpoint        string                   `json:"mountpoint,omitempty"`
+}
+
+// BlackboxState is the JSON document that ReadBlackboxState decodes from the
+// blackbox state file. Targets holds one BlackboxTargetStatus per target.
+type BlackboxState struct {
+	Status           string                 `json:"status"`
+	BootStartedAtUTC string                 `json:"boot_started_at_utc"`
+	BootFolder       string                 `json:"boot_folder"`
+	UpdatedAtUTC     string                 `json:"updated_at_utc"`
+	Targets          []BlackboxTargetStatus `json:"targets"`
+}
+
+// blackboxRuntime is the state of one RunBlackbox invocation: the resolved
+// export directory and state file, the platform handle used to list
+// removable targets, the boot time with the folder name derived from it, and
+// the set of running workers.
+type blackboxRuntime struct {
+	exportDir   string
+	statePath   string
+	system      *platform.System
+	bootStarted time.Time
+	bootFolder  string
+
+	// mu is held by reconcile and stopAll while they read or modify workers.
+	mu      sync.Mutex
+	// workers maps an enrollment ID to the worker serving that target.
+	workers map[string]*blackboxWorker
+}
+
+// discoveredBlackboxTarget is one result of discoverMarkedTargets: a
+// removable target whose marker was read successfully, together with the
+// mountpoint used while probing it and whether that mount was made by bee.
+type discoveredBlackboxTarget struct {
+	marker       BlackboxMarker
+	target       platform.RemovableTarget
+	seenMount    string
+	mountedByBee bool
+}
+
+// blackboxWorker serves a single enrolled target. reconcile creates one per
+// enrollment ID and starts its run loop in a goroutine.
+type blackboxWorker struct {
+	runtime      *blackboxRuntime
+	enrollmentID string
+
+	// NOTE(review): mu presumably guards the mutable fields below; the
+	// locking code is outside this view — confirm.
+	mu           sync.Mutex
+	target       platform.RemovableTarget
+	marker       BlackboxMarker
+	mountpoint   string
+	mountedByBee bool
+	status       string
+	lastSyncAt   time.Time
+	lastDuration time.Duration
+	flushPeriod  time.Duration
+	lastError    string
+	fastCycles   int
+	// The run loop exits when stopCh fires and closes stoppedCh on the way
+	// out.
+	stopCh       chan struct{}
+	stoppedCh    chan struct{}
+}
+
+// RunBlackbox runs the blackbox discovery loop until ctx is done.
+//
+// Blank exportDir and statePath fall back to DefaultExportDir and
+// DefaultBlackboxStatePath, and a nil system is replaced with platform.New().
+// The boot time comes from bootStartedAtUTC, or from the current time when
+// that fails. After persisting an initial state the function reconciles the
+// set of workers immediately and then once per blackboxDiscoverInterval.
+//
+// When ctx is done all workers are stopped and ctx.Err() is returned; the
+// function never returns nil.
+func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
+	if exportDir = strings.TrimSpace(exportDir); exportDir == "" {
+		exportDir = DefaultExportDir
+	}
+	if statePath = strings.TrimSpace(statePath); statePath == "" {
+		statePath = DefaultBlackboxStatePath
+	}
+	if system == nil {
+		system = platform.New()
+	}
+
+	startedAt, bootErr := bootStartedAtUTC()
+	if bootErr != nil {
+		startedAt = blackboxNow()
+	}
+
+	rt := &blackboxRuntime{
+		exportDir:   exportDir,
+		statePath:   statePath,
+		system:      system,
+		bootStarted: startedAt,
+		bootFolder:  SupportBundleBaseName(startedAt),
+		workers:     make(map[string]*blackboxWorker),
+	}
+	_ = os.MkdirAll(filepath.Dir(statePath), 0755)
+	rt.persistState()
+
+	tick := time.NewTicker(blackboxDiscoverInterval)
+	defer tick.Stop()
+	for {
+		rt.reconcile()
+		select {
+		case <-tick.C:
+			continue
+		case <-ctx.Done():
+		}
+		rt.stopAll()
+		return ctx.Err()
+	}
+}
+
+// ReadBlackboxState reads and JSON-decodes the blackbox state file at path.
+// A blank path selects DefaultBlackboxStatePath. On a read or decode error
+// the zero BlackboxState is returned together with the error.
+func ReadBlackboxState(path string) (BlackboxState, error) {
+	if path = strings.TrimSpace(path); path == "" {
+		path = DefaultBlackboxStatePath
+	}
+
+	payload, readErr := os.ReadFile(path)
+	if readErr != nil {
+		return BlackboxState{}, readErr
+	}
+
+	var decoded BlackboxState
+	if decodeErr := json.Unmarshal(payload, &decoded); decodeErr != nil {
+		return BlackboxState{}, decodeErr
+	}
+	return decoded, nil
+}
+
+func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
+	target = sanitizeRemovableTarget(target)
+	if target.Device == "" {
+		return BlackboxMarker{}, fmt.Errorf("device is required")
+	}
+	mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
+	if err != nil {
+		return BlackboxMarker{}, err
+	}
+	defer func() {
+		if mountedByBee {
+			_ = unmountTarget(mountpoint)
+		}
+	}()
+
+	marker, _, err := readBlackboxMarker(mountpoint)
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return BlackboxMarker{}, err
+	}
+	if marker.EnrollmentID == "" {
+		marker = BlackboxMarker{
+			Version:      1,
+			EnrollmentID: newBlackboxEnrollmentID(),
+			CreatedAtUTC: blackboxNow().Format(time.RFC3339),
+			Host:         hostnameOr("unknown"),
+		}
+	}
+	if err := writeBlackboxMarker(mountpoint, marker); err != nil {
+		return BlackboxMarker{}, err
+	}
+	return marker, nil
+}
+
+func DisableBlackboxTarget(device, enrollmentID string) error {
+	device = strings.TrimSpace(device)
+	enrollmentID = strings.TrimSpace(enrollmentID)
+	if device == "" && enrollmentID == "" {
+		return fmt.Errorf("device or enrollment_id is required")
+	}
+	system := platform.New()
+	targets, err := system.ListRemovableTargets()
+	if err != nil {
+		return err
+	}
+	for _, target := range targets {
+		target = sanitizeRemovableTarget(target)
+		mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
+		if mountErr != nil {
+			continue
+		}
+		remove := false
+		marker, _, err := readBlackboxMarker(mountpoint)
+		if err == nil {
+			if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
+				remove = true
+			}
+			if device != "" && target.Device == device {
+				remove = true
+			}
+		}
+		if remove {
+			err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
+		}
+		if mountedByBee {
+			_ = unmountTarget(mountpoint)
+		}
+		if remove {
+			return err
+		}
+	}
+	return os.ErrNotExist
+}
+
+// reconcile brings the set of running workers in line with the marked
+// targets that are currently attached.
+//
+// A worker is started for every newly discovered enrollment ID, existing
+// workers are updated with the latest discovery result, and workers whose
+// target is no longer seen are stopped and dropped. The state file is
+// persisted at the end of every call.
+//
+// When discovery itself fails, the current workers are left untouched: a
+// failed listing says nothing about which targets are attached, and treating
+// it as "no targets" would stop every worker on a transient error.
+func (rt *blackboxRuntime) reconcile() {
+	discovered, err := rt.discoverMarkedTargets()
+
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+
+	if err != nil {
+		rt.persistStateLocked()
+		return
+	}
+
+	seen := make(map[string]struct{}, len(discovered))
+	for _, found := range discovered {
+		seen[found.marker.EnrollmentID] = struct{}{}
+		worker, ok := rt.workers[found.marker.EnrollmentID]
+		if !ok {
+			worker = newBlackboxWorker(rt, found)
+			rt.workers[found.marker.EnrollmentID] = worker
+			go worker.run()
+			continue
+		}
+		worker.update(found)
+	}
+	for id, worker := range rt.workers {
+		if _, ok := seen[id]; ok {
+			continue
+		}
+		worker.stop()
+		delete(rt.workers, id)
+	}
+	rt.persistStateLocked()
+}
+
+// stopAll detaches every worker from the runtime, persists the now-empty
+// state, and then stops the detached workers. The workers are stopped only
+// after rt.mu has been released.
+func (rt *blackboxRuntime) stopAll() {
+	rt.mu.Lock()
+	detached := make([]*blackboxWorker, 0, len(rt.workers))
+	for id := range rt.workers {
+		detached = append(detached, rt.workers[id])
+	}
+	rt.workers = make(map[string]*blackboxWorker)
+	rt.persistStateLocked()
+	rt.mu.Unlock()
+
+	for i := range detached {
+		detached[i].stop()
+	}
+}
+
+// discoverMarkedTargets lists removable targets and returns those that carry a
+// readable blackbox marker with a non-empty enrollment ID, sorted by
+// enrollment ID.
+//
+// Each target is mounted just long enough to read the marker. A mount created
+// for the probe is always released again, including when the marker is
+// missing, unreadable, or has no enrollment ID. Targets without a device path
+// or that cannot be mounted are skipped. The only error returned is a failure
+// to list the removable targets.
+func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
+	targets, err := rt.system.ListRemovableTargets()
+	if err != nil {
+		return nil, err
+	}
+	var out []discoveredBlackboxTarget
+	for _, rawTarget := range targets {
+		target := sanitizeRemovableTarget(rawTarget)
+		if target.Device == "" {
+			continue
+		}
+		mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
+		if err != nil {
+			continue
+		}
+		marker, ok, err := readBlackboxMarker(mountpoint)
+		// The probe mount is only needed for reading the marker; drop it on
+		// every path so a rejected marker does not leave the mount behind.
+		if mountedByBee {
+			_ = unmountTarget(mountpoint)
+		}
+		if err != nil || !ok || marker.EnrollmentID == "" {
+			continue
+		}
+		out = append(out, discoveredBlackboxTarget{
+			marker:       marker,
+			target:       target,
+			seenMount:    mountpoint,
+			mountedByBee: mountedByBee,
+		})
+	}
+	sort.Slice(out, func(i, j int) bool {
+		return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
+	})
+	return out, nil
+}
+
+// newBlackboxWorker builds a worker for a discovered target. The worker starts
+// in the "running" status with the minimum flush period; the caller is
+// responsible for starting its run loop.
+func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
+	return &blackboxWorker{
+		runtime:      rt,
+		enrollmentID: found.marker.EnrollmentID,
+		target:       found.target,
+		marker:       found.marker,
+		flushPeriod:  blackboxMinFlushPeriod,
+		status:       "running",
+		stopCh:       make(chan struct{}),
+		stoppedCh:    make(chan struct{}),
+	}
+}
+
+// run is the worker loop: perform one sync cycle, record its outcome, then wait
+// for the current flush period before the next cycle. The stop signal is only
+// observed during the wait between cycles; when it arrives the worker cleans
+// up and returns. stoppedCh is closed on exit so stop can unblock.
+func (w *blackboxWorker) run() {
+	defer close(w.stoppedCh)
+	for {
+		start := time.Now()
+		err := w.syncCycle()
+		duration := time.Since(start)
+		w.finishCycle(duration, err)
+
+		// finishCycle may have adjusted the period, so read it afresh.
+		wait := w.currentFlushPeriod()
+		timer := time.NewTimer(wait)
+		select {
+		case <-w.stopCh:
+			timer.Stop()
+			w.cleanup()
+			return
+		case <-timer.C:
+		}
+	}
+}
+
+// update replaces the worker's target and marker with the freshly discovered
+// ones, under the worker lock.
+func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	w.target = found.target
+	w.marker = found.marker
+}
+
+// stop signals the run loop to exit by closing stopCh (unless it is already
+// closed) and then blocks until run has closed stoppedCh.
+func (w *blackboxWorker) stop() {
+	select {
+	case <-w.stopCh:
+		// Already closed by an earlier stop call.
+	default:
+		close(w.stopCh)
+	}
+	<-w.stoppedCh
+}
+
+// currentFlushPeriod returns the worker's flush period, read under the worker
+// lock.
+func (w *blackboxWorker) currentFlushPeriod() time.Duration {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return w.flushPeriod
+}
+
+// finishCycle records the outcome of one sync cycle and adapts the flush
+// period.
+//
+// On error the worker becomes "degraded", the error text is stored and the
+// fast-cycle counter is reset. On success the worker is "running", the last
+// sync time is updated, and the fast-cycle counter grows only when the cycle
+// took at most half of the current flush period.
+//
+// The worker lock is released before the runtime state is persisted:
+// persisting takes the runtime lock and then each registered worker's lock,
+// this worker's included, so calling it with w.mu held would deadlock.
+func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
+	w.mu.Lock()
+	w.lastDuration = duration
+	if err != nil {
+		w.status = "degraded"
+		w.lastError = err.Error()
+		w.fastCycles = 0
+		w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
+	} else {
+		w.status = "running"
+		w.lastSyncAt = blackboxNow()
+		w.lastError = ""
+		if duration <= w.flushPeriod/2 {
+			w.fastCycles++
+		} else {
+			w.fastCycles = 0
+		}
+		w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
+	}
+	w.mu.Unlock()
+
+	w.runtime.persistState()
+}
+
+// adjustFlushPeriod computes the next flush period from the current period and
+// the duration of the cycle that just finished.
+//
+// A non-positive current period is treated as the minimum period, and a
+// non-positive duration as equal to the current period. When the cycle ran
+// longer than the period, the period grows to 1.25x of whichever is larger,
+// the period or the duration. After a successful cycle with at least
+// blackboxRecoveryFastCount consecutive fast cycles, the period shrinks to
+// 0.9x instead. The result is clamped to the configured minimum and maximum.
+func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
+	if current <= 0 {
+		current = blackboxMinFlushPeriod
+	}
+	if duration <= 0 {
+		duration = current
+	}
+
+	next := current
+	if duration > current {
+		// Back off based on the slower of the two measurements.
+		next = time.Duration(float64(current) * 1.25)
+		if fromDuration := time.Duration(float64(duration) * 1.25); fromDuration > next {
+			next = fromDuration
+		}
+	}
+	if success && fastCycles >= blackboxRecoveryFastCount {
+		next = time.Duration(float64(current) * 0.9)
+	}
+
+	if next < blackboxMinFlushPeriod {
+		next = blackboxMinFlushPeriod
+	}
+	if next > blackboxMaxFlushPeriod {
+		next = blackboxMaxFlushPeriod
+	}
+	return next
+}
+
+// syncCycle performs one flush to the target: it makes sure the target is
+// mounted, mirrors the runtime's export directory into
+// <mountpoint>/<bootFolder>/export, captures the journal/dmesg/log snapshots
+// next to it, and finally syncs the filesystem. The first error aborts the
+// cycle and is returned.
+func (w *blackboxWorker) syncCycle() error {
+	target, marker := w.snapshotTarget()
+	mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
+	if err != nil {
+		return err
+	}
+	// Remember the mount so cleanup can release it when the worker stops.
+	w.recordMountpoint(mountpoint, mountedByBee)
+
+	root := filepath.Join(mountpoint, w.runtime.bootFolder)
+	if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
+		return err
+	}
+	if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
+		return err
+	}
+	if err := w.captureSnapshots(root); err != nil {
+		return err
+	}
+	return syncFilesystem(root)
+}
+
+// cleanup unmounts the worker's last recorded mountpoint, but only when that
+// mount was created by bee. The unmount error is ignored.
+func (w *blackboxWorker) cleanup() {
+	w.mu.Lock()
+	mountpoint := w.mountpoint
+	mountedByBee := w.mountedByBee
+	w.mu.Unlock()
+	if mountedByBee && mountpoint != "" {
+		_ = unmountTarget(mountpoint)
+	}
+}
+
+// snapshotTarget returns copies of the worker's current target and marker,
+// read under the worker lock.
+func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return w.target, w.marker
+}
+
+// recordMountpoint stores the mountpoint used by the latest cycle and whether
+// bee created that mount, under the worker lock.
+func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	w.mountpoint = mountpoint
+	w.mountedByBee = mountedByBee
+}
+
+// captureSnapshots writes diagnostic snapshots below root: the combined
+// journal since boot, a per-service journal and systemctl status for every
+// entry in supportBundleServices, the dmesg output, and copies of the optional
+// log files. A missing optional file is not an error; any other failure stops
+// the capture and is returned.
+func (w *blackboxWorker) captureSnapshots(root string) error {
+	if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
+		return err
+	}
+	for _, svc := range supportBundleServices {
+		if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
+			return err
+		}
+		if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
+			return err
+		}
+	}
+	if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
+		return err
+	}
+	for _, item := range supportBundleOptionalFiles {
+		// Optional files are allowed to be absent on this system.
+		if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
+			return err
+		}
+	}
+	return nil
+}
+
+// persistState takes the runtime lock and writes the current state file.
+// It must not be called with rt.mu already held; use persistStateLocked then.
+func (rt *blackboxRuntime) persistState() {
+	rt.mu.Lock()
+	defer rt.mu.Unlock()
+	rt.persistStateLocked()
+}
+
+// persistStateLocked builds a BlackboxState from the registered workers and
+// writes it to rt.statePath. Callers in this file invoke it with rt.mu held.
+//
+// The overall status is "disabled" with no workers, "running" with at least
+// one worker, and "degraded" as soon as any worker is degraded. Each worker's
+// lock is taken while its fields are read. Targets are sorted by enrollment
+// ID. A failure to write the state file is ignored.
+func (rt *blackboxRuntime) persistStateLocked() {
+	state := BlackboxState{
+		Status:           "disabled",
+		BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
+		BootFolder:       rt.bootFolder,
+		UpdatedAtUTC:     blackboxNow().Format(time.RFC3339),
+		Targets:          make([]BlackboxTargetStatus, 0, len(rt.workers)),
+	}
+	if len(rt.workers) > 0 {
+		state.Status = "running"
+	}
+	for _, worker := range rt.workers {
+		worker.mu.Lock()
+		targetState := BlackboxTargetStatus{
+			EnrollmentID: worker.enrollmentID,
+			Device:       worker.target.Device,
+			// NOTE(review): FS receives the whole target value here; the field
+			// name suggests worker.target.FSType may be intended — confirm
+			// against the BlackboxTargetStatus declaration.
+			FS:           worker.target,
+			BootFolder:   rt.bootFolder,
+			Status:       worker.status,
+			FlushPeriod:  worker.flushPeriod.String(),
+			LastError:    worker.lastError,
+			Mountpoint:   worker.mountpoint,
+		}
+		// Optional fields stay empty until the worker has produced a value.
+		if !worker.lastSyncAt.IsZero() {
+			targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
+		}
+		if worker.lastDuration > 0 {
+			targetState.LastCycleDuration = worker.lastDuration.String()
+		}
+		if worker.status == "degraded" {
+			state.Status = "degraded"
+		}
+		worker.mu.Unlock()
+		state.Targets = append(state.Targets, targetState)
+	}
+	sort.Slice(state.Targets, func(i, j int) bool {
+		return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
+	})
+	_ = writeJSONAtomic(rt.statePath, state)
+}
+
+// bootStartedAtUTC returns the system boot time taken from the "btime" line of
+// /proc/stat, as a UTC time with second precision. It returns an error when
+// the file cannot be read, and "boot time not found" when the line is absent
+// or malformed.
+func bootStartedAtUTC() (time.Time, error) {
+	raw, err := os.ReadFile("/proc/stat")
+	if err != nil {
+		return time.Time{}, err
+	}
+	for _, line := range strings.Split(string(raw), "\n") {
+		line = strings.TrimSpace(line)
+		if !strings.HasPrefix(line, "btime ") {
+			continue
+		}
+		parts := strings.Fields(line)
+		if len(parts) != 2 {
+			break
+		}
+		// The value is a count of seconds; it is parsed by appending "s" and
+		// reading it as a duration, then converted back to whole seconds.
+		sec, err := time.ParseDuration(parts[1] + "s")
+		if err != nil {
+			break
+		}
+		return time.Unix(int64(sec/time.Second), 0).UTC(), nil
+	}
+	return time.Time{}, fmt.Errorf("boot time not found")
+}
+
+// newBlackboxEnrollmentID returns a new enrollment ID of the form "bb-" plus
+// 16 hex characters from 8 random bytes. If reading random bytes fails, it
+// falls back to "bb-" plus the current time in nanoseconds.
+func newBlackboxEnrollmentID() string {
+	var buf [8]byte
+	if _, err := rand.Read(buf[:]); err != nil {
+		return fmt.Sprintf("bb-%d", time.Now().UnixNano())
+	}
+	return "bb-" + hex.EncodeToString(buf[:])
+}
+
+// sanitizeRemovableTarget returns a copy of target with surrounding whitespace
+// trimmed from its Device, FSType, Size, Label, Model and Mountpoint fields.
+func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
+	target.Device = strings.TrimSpace(target.Device)
+	target.FSType = strings.TrimSpace(target.FSType)
+	target.Size = strings.TrimSpace(target.Size)
+	target.Label = strings.TrimSpace(target.Label)
+	target.Model = strings.TrimSpace(target.Model)
+	target.Mountpoint = strings.TrimSpace(target.Mountpoint)
+	return target
+}
+
+// ensureMountedTarget returns a writable mountpoint for target.
+//
+// If the target already reports a mountpoint that passes the write probe, that
+// mountpoint is returned with mountedByBee=false. Otherwise the device is
+// mounted at /tmp/bee-blackbox-<sanitized suffix> and, once the write probe
+// succeeds there, returned with mountedByBee=true. A mount that turns out not
+// to be writable is unmounted again before the error is returned.
+func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
+	target = sanitizeRemovableTarget(target)
+	if target.Mountpoint != "" {
+		// A failed write probe falls through to mounting the device ourselves.
+		if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
+			return target.Mountpoint, false, nil
+		}
+	}
+	mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
+	if err := os.MkdirAll(mountpoint, 0755); err != nil {
+		return "", false, err
+	}
+	if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
+		return "", false, formatBlackboxMountTargetError(target, string(raw), err)
+	}
+	if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
+		_ = unmountTarget(mountpoint)
+		return "", false, err
+	}
+	return mountpoint, true, nil
+}
+
+// unmountTarget runs "sync" (ignoring its result) and then "umount" on
+// mountpoint. On failure the error wraps the umount exit error and, when the
+// command printed anything, is prefixed with that output.
+func unmountTarget(mountpoint string) error {
+	_ = blackboxExecCommand("sync").Run()
+	raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
+	if err != nil {
+		msg := strings.TrimSpace(string(raw))
+		if msg == "" {
+			return err
+		}
+		return fmt.Errorf("%s: %w", msg, err)
+	}
+	return nil
+}
+
+// readBlackboxMarker reads and decodes the marker file in mountpoint.
+//
+// It returns (marker, true, nil) on success. When the file does not exist the
+// error is exactly os.ErrNotExist; other read errors and JSON decoding errors
+// are returned as-is. In every failure case the marker is zero and the bool is
+// false.
+func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
+	raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
+	if err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			return BlackboxMarker{}, false, os.ErrNotExist
+		}
+		return BlackboxMarker{}, false, err
+	}
+	var marker BlackboxMarker
+	if err := json.Unmarshal(raw, &marker); err != nil {
+		return BlackboxMarker{}, false, err
+	}
+	return marker, true, nil
+}
+
+// writeBlackboxMarker writes marker as JSON to the marker file in mountpoint.
+// A zero Version is replaced with 1 before writing.
+func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
+	if marker.Version == 0 {
+		marker.Version = 1
+	}
+	return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
+}
+
+// syncDirectoryTree mirrors srcDir into dstDir: directories are created with
+// the source permission bits, files are copied when their content differs, and
+// afterwards every path in dstDir that was not seen in srcDir is removed.
+// The first error encountered while walking or copying is returned.
+func syncDirectoryTree(srcDir, dstDir string) error {
+	// seen holds the source-relative path of every entry visited in srcDir.
+	seen := make(map[string]struct{})
+	err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		rel, err := filepath.Rel(srcDir, path)
+		if err != nil {
+			return err
+		}
+		rel = filepath.Clean(rel)
+		if rel == "." {
+			seen["."] = struct{}{}
+			return os.MkdirAll(dstDir, 0755)
+		}
+		seen[rel] = struct{}{}
+		dstPath := filepath.Join(dstDir, rel)
+		if d.IsDir() {
+			info, err := d.Info()
+			if err != nil {
+				return err
+			}
+			return os.MkdirAll(dstPath, info.Mode().Perm())
+		}
+		return copyFileIfChanged(path, dstPath)
+	})
+	if err != nil {
+		return err
+	}
+	return removeMissingPaths(dstDir, seen)
+}
+
+// removeMissingPaths deletes every entry below dstDir whose path relative to
+// dstDir is not a key of seen. dstDir itself is never removed.
+//
+// When a directory is deleted the walk is told to skip it. Otherwise WalkDir
+// would try to read the directory that was just removed and report that
+// failure back through the callback, failing the whole walk.
+//
+// It returns the first error from walking, from computing a relative path, or
+// from removing an entry.
+func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
+	return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		rel, err := filepath.Rel(dstDir, path)
+		if err != nil {
+			return err
+		}
+		rel = filepath.Clean(rel)
+		if rel == "." {
+			return nil
+		}
+		if _, ok := seen[rel]; ok {
+			return nil
+		}
+		if err := os.RemoveAll(path); err != nil {
+			return err
+		}
+		if d.IsDir() {
+			// The subtree is gone; do not descend into it.
+			return fs.SkipDir
+		}
+		return nil
+	})
+}
+
+// copyFileIfChanged copies src to dst unless dst already holds identical
+// content. If src is a directory, dst is created as a directory with the
+// source permission bits instead. The file is read fully into memory and
+// written with the source permission bits. A stat or read error on src is
+// returned unchanged.
+func copyFileIfChanged(src, dst string) error {
+	info, err := os.Stat(src)
+	if err != nil {
+		return err
+	}
+	if info.IsDir() {
+		return os.MkdirAll(dst, info.Mode().Perm())
+	}
+	srcData, err := os.ReadFile(src)
+	if err != nil {
+		return err
+	}
+	// An unreadable or missing destination simply means "copy it".
+	if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
+		return nil
+	}
+	return writeFileAtomic(dst, srcData, info.Mode().Perm())
+}
+
+// captureCommandAtomic runs the command and writes its combined stdout/stderr
+// to dst with mode 0644. When the command produced no output, the file holds
+// the command's error text, or "no output" if it succeeded. A command failure
+// is not returned to the caller; only the file write can fail this function.
+func captureCommandAtomic(dst string, name string, args ...string) error {
+	raw, err := blackboxExecCommand(name, args...).CombinedOutput()
+	if len(raw) == 0 {
+		if err != nil {
+			raw = []byte(err.Error() + "\n")
+		} else {
+			raw = []byte("no output\n")
+		}
+	}
+	return writeFileAtomic(dst, raw, 0644)
+}
+
+// writeJSONAtomic encodes v as two-space indented JSON with a trailing newline
+// and writes it to path with mode 0644 via writeFileAtomic.
+func writeJSONAtomic(path string, v any) error {
+	raw, err := json.MarshalIndent(v, "", "  ")
+	if err != nil {
+		return err
+	}
+	raw = append(raw, '\n')
+	return writeFileAtomic(path, raw, 0644)
+}
+
+// writeFileAtomic writes data to path by writing a sibling "<path>.tmp" file,
+// syncing it, and renaming it over path, then syncs the filesystem.
+//
+// The parent directory is created if needed. Nothing is written when path
+// already holds exactly data. If writing, syncing, closing or renaming fails,
+// the temporary file is removed so a failed attempt does not leave a stray
+// ".tmp" file next to the destination.
+func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return err
+	}
+	if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
+		return nil
+	}
+	tmp := path + ".tmp"
+	f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
+	if err != nil {
+		return err
+	}
+	if _, err := f.Write(data); err != nil {
+		_ = f.Close()
+		_ = os.Remove(tmp)
+		return err
+	}
+	if err := f.Sync(); err != nil {
+		_ = f.Close()
+		_ = os.Remove(tmp)
+		return err
+	}
+	if err := f.Close(); err != nil {
+		_ = os.Remove(tmp)
+		return err
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		_ = os.Remove(tmp)
+		return err
+	}
+	return syncFilesystem(filepath.Dir(path))
+}
+
+// syncFilesystem runs the "sync" command and returns its result. The path
+// argument is currently unused.
+func syncFilesystem(path string) error {
+	return blackboxExecCommand("sync").Run()
+}
+
+// ensureWritableBlackboxMountpoint checks that files can be created in
+// mountpoint by creating and then deleting a temporary probe file. A creation
+// failure is reported as "target filesystem is not writable"; otherwise a
+// close error takes precedence over a removal error.
+func ensureWritableBlackboxMountpoint(mountpoint string) error {
+	f, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
+	if err != nil {
+		return fmt.Errorf("target filesystem is not writable: %w", err)
+	}
+	probePath := f.Name()
+	closeErr := f.Close()
+	// The probe file is removed whether or not closing it succeeded.
+	removeErr := os.Remove(probePath)
+	if closeErr != nil {
+		return closeErr
+	}
+	return removeErr
+}
+
+// formatBlackboxMountTargetError turns a failed mount into a descriptive
+// error. For an exFAT target whose mount output reports the filesystem type as
+// unknown, it explains that exFAT support is missing from the ISO build.
+// Otherwise it prefixes err with the trimmed command output, or returns err
+// unchanged when there was no output.
+func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
+	msg := strings.TrimSpace(raw)
+	fstype := strings.ToLower(strings.TrimSpace(target.FSType))
+	if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
+		return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
+	}
+	if msg == "" {
+		return err
+	}
+	return fmt.Errorf("%s: %w", msg, err)
+}
@@ -0,0 +1,52 @@
+package app
+
+import (
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+// TestAdjustFlushPeriodGrowsOnSlowCycle checks that a failed cycle lasting
+// longer than the current period makes the period grow.
+func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
+	current := 2 * time.Second
+	got := adjustFlushPeriod(current, 4*time.Second, false, 0)
+	if got <= current {
+		t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
+	}
+}
+
+// TestAdjustFlushPeriodShrinksAfterFastCycles checks that enough consecutive
+// fast successful cycles shrink the period without going below the minimum.
+func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
+	current := 10 * time.Second
+	got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
+	if got >= current {
+		t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
+	}
+	if got < blackboxMinFlushPeriod {
+		t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
+	}
+}
+
+// TestReadBlackboxState writes a state file with writeJSONAtomic and checks
+// that ReadBlackboxState returns the same status, boot folder and target.
+func TestReadBlackboxState(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "blackbox-state.json")
+	want := BlackboxState{
+		Status:           "running",
+		BootStartedAtUTC: "2026-04-24T00:00:00Z",
+		BootFolder:       "boot-folder",
+		UpdatedAtUTC:     "2026-04-24T00:00:01Z",
+		Targets: []BlackboxTargetStatus{{
+			EnrollmentID: "bb-1",
+			Device:       "/dev/sdb1",
+			Status:       "running",
+			FlushPeriod:  "1s",
+		}},
+	}
+	if err := writeJSONAtomic(path, want); err != nil {
+		t.Fatalf("writeJSONAtomic: %v", err)
+	}
+	got, err := ReadBlackboxState(path)
+	if err != nil {
+		t.Fatalf("ReadBlackboxState: %v", err)
+	}
+	if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
+		t.Fatalf("state=%+v", got)
+	}
+}
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return nil, err
 	}
-	data, err := os.ReadFile(path)
+	data, err := readFileLimited(path, 10<<20)
 	if err != nil && !os.IsNotExist(err) {
 		return nil, err
 	}
@@ -2,6 +2,7 @@ package app

 import (
 	"archive/tar"
+	"bee/audit/internal/platform"
 	"compress/gzip"
 	"fmt"
 	"io"
@@ -14,6 +15,7 @@ import (
 )

 var supportBundleServices = []string{
+	"bee-blackbox.service",
 	"bee-audit.service",
 	"bee-web.service",
 	"bee-network.service",
@@ -22,6 +24,8 @@ var supportBundleServices = []string{
 	"bee-selfheal.service",
 	"bee-selfheal.timer",
 	"bee-sshsetup.service",
+	"nvidia-dcgm.service",
+	"nvidia-fabricmanager.service",
 }

 var supportBundleCommands = []struct {
@@ -48,6 +52,43 @@ else
 fi
 `}},
 	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
+	{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
+if command -v nvidia-smi >/dev/null 2>&1; then
+  nvidia-smi topo -m 2>&1 || true
+else
+  echo "nvidia-smi not found"
+fi
+`}},
+	{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
+if ! command -v systemctl >/dev/null 2>&1; then
+  echo "systemctl not found"
+  exit 0
+fi
+echo "=== unit files ==="
+systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
+echo
+echo "=== active units ==="
+systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
+echo
+echo "=== failed units ==="
+systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
+`}},
+	{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
+for candidate in \
+  /usr/bin/nvidia-fabricmanager \
+  /usr/bin/nv-fabricmanager \
+  /usr/bin/nvidia-fabricmanagerd \
+  /usr/bin/nvlsm; do
+  if [ -e "$candidate" ]; then
+    echo "=== $candidate ==="
+    ls -l "$candidate" 2>&1 || true
+    echo
+  fi
+done
+if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
+  echo "no fabric manager binaries found"
+fi
+`}},
 	{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
 if ! command -v lspci >/dev/null 2>&1; then
  echo "lspci not found"
@@ -195,6 +236,10 @@ var supportBundleOptionalFiles = []struct {
 }{
 	{name: "system/kern.log", src: "/var/log/kern.log"},
 	{name: "system/syslog.txt", src: "/var/log/syslog"},
+	{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
+	{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
+	{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
+	{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
 }

 const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
@@ -212,11 +257,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
 	}

 	now := time.Now().UTC()
-	date := now.Format("2006-01-02")
-	tod := now.Format("150405")
-	ver := bundleVersion()
-	model := serverModelForBundle()
-	sn := serverSerialForBundle()

 	stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
 	if err := os.MkdirAll(stageRoot, 0755); err != nil {
@@ -250,7 +290,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
 		return "", err
 	}

-	archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
+	archiveName := SupportBundleBaseName(now) + ".tar.gz"
 	archivePath := filepath.Join(os.TempDir(), archiveName)
 	if err := createSupportTarGz(archivePath, stageRoot); err != nil {
 		return "", err
@@ -258,6 +298,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
 	return archivePath, nil
 }

+// SupportBundleBaseName returns the support bundle file name without extension
+// for the given time, in the form
+// "<YYYY-MM-DD> (BEE-SP v<version>) <model> <serial> <HHMMSS>". The time is
+// converted to UTC first.
+func SupportBundleBaseName(at time.Time) string {
+	at = at.UTC()
+	date := at.Format("2006-01-02")
+	tod := at.Format("150405")
+	ver := bundleVersion()
+	model := serverModelForBundle()
+	sn := serverSerialForBundle()
+	return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
+}
+
 func LatestSupportBundlePath() (string, error) {
 	return latestSupportBundlePath(os.TempDir())
 }
@@ -381,6 +431,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
 	fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
 	fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
+	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
+		fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
+		fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
+		if strings.TrimSpace(cfg.Reason) != "" {
+			fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
+		}
+	}
 	fmt.Fprintf(&body, "\nfiles:\n")

 	var files []string
@@ -4,7 +4,9 @@ import (
 	"bee/audit/internal/schema"
 	"fmt"
 	"log/slog"
+	"os"
 	"os/exec"
+	"path/filepath"
 	"strconv"
 	"strings"
 )
@@ -140,6 +142,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
 		} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
 			dev.NUMANode = &numaNode
 		}
+		if group, ok := readPCIIOMMUGroup(bdf); ok {
+			dev.IOMMUGroup = &group
+		}
 		if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
 			dev.LinkWidth = &width
 		}
@@ -179,6 +184,21 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
 	return dev
 }

+// readPCIIOMMUGroup looks up the IOMMU group of the PCI device with the given
+// BDF. It reads the sysfs symlink /sys/bus/pci/devices/<bdf>/iommu_group and
+// parses the last path element of its target as the group number. The bool is
+// false when the link cannot be read or its base name is not an integer.
+func readPCIIOMMUGroup(bdf string) (int, bool) {
+	target, err := os.Readlink("/sys/bus/pci/devices/" + bdf + "/iommu_group")
+	if err != nil {
+		return 0, false
+	}
+	group, err := strconv.Atoi(filepath.Base(target))
+	if err != nil {
+		return 0, false
+	}
+	return group, true
+}
+
 // readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
 func readPCIIDs(bdf string) (vendorID, deviceID int) {
 	base := "/sys/bus/pci/devices/" + bdf
@@ -160,11 +160,57 @@ type psuSDR struct {
 }

 var psuSlotPatterns = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
+	// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
+	// does not fire after the digit; match explicitly with underscore terminator.
+	regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
+	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),                    // PSU1, PS1, ps 2
+	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),                      // PS 6, PS6
+	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),                     // PWS1
+	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
+	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),                     // Bay 1
+	// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
+	// Must be last: "power supply N" is already caught by the pattern above.
+	regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
+}
+
+// isPSUInputPower reports whether an SDR sensor name denotes a PSU AC-input
+// power reading. The keywords are lower-case and matched as substrings, so the
+// name is expected to be lower-cased already. Vendor examples:
+//
+//	MSI: PSU1_POWER_IN, PSU1_PIN
+//	MLT: PSU1_PIN
+//	HPE: PS1 Input Power, PS1 Input Watts
+//
+// xFusion names such as "Power1" carry no input keyword and are not matched
+// here.
+func isPSUInputPower(name string) bool {
+	for _, keyword := range []string{
+		"input power",
+		"input watts",
+		"_pin",
+		" pin",
+		"_power_in",
+		"power_in",
+	} {
+		if strings.Contains(name, keyword) {
+			return true
+		}
+	}
+	return false
+}
+
+// isPSUOutputPower reports whether an SDR sensor name denotes a PSU DC-output
+// power reading. The keywords are lower-case and matched as substrings, so the
+// name is expected to be lower-cased already. Vendor examples:
+//
+//	MSI:     PSU1_POWER_OUT
+//	MLT:     PSU1_POUT
+//	xFusion: PS1 POut
+//
+// Names containing "power supply bay" or "psu bay" also count as output power.
+func isPSUOutputPower(name string) bool {
+	for _, keyword := range []string{
+		"output power",
+		"output watts",
+		"_pout",
+		" pout",
+		"_power_out",
+		"power_out",
+		"power supply bay",
+		"psu bay",
+	} {
+		if strings.Contains(name, keyword) {
+			return true
+		}
+	}
+	return false
+}
+
+// parseBoundedFloat parses a numeric value from an SDR value field and
+// validates it is within (0, max]. Returns nil for zero, negative, or
+// out-of-range values — these indicate missing/off/fault sensor readings.
+func parseBoundedFloat(raw string, max float64) *float64 {
+	v := parseFloatPtr(raw)
+	if v == nil || *v <= 0 || *v > max {
+		return nil
+	}
+	return v
 }

 func parsePSUSDR(raw string) map[int]psuSDR {
@@ -194,24 +240,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {

 		lowerName := strings.ToLower(name)
 		switch {
-		case strings.Contains(lowerName, "input power"):
-			entry.inputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "output power"):
-			entry.outputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
-			entry.outputPowerW = parseFloatPtr(value)
+		case isPSUInputPower(lowerName):
+			entry.inputPowerW = parseBoundedFloat(value, 6000)
+		case isPSUOutputPower(lowerName):
+			entry.outputPowerW = parseBoundedFloat(value, 6000)
 		case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
 			entry.inputVoltage = parseFloatPtr(value)
 		case strings.Contains(lowerName, "temp"):
 			entry.temperatureC = parseFloatPtr(value)
 		case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
 			entry.healthPct = parsePercentPtr(value)
+		default:
+			// Generic PSU power reading: sensor matched a slot pattern but carries
+			// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
+			// AC input if the value looks like wattage and no better data is set yet.
+			if entry.inputPowerW == nil {
+				entry.inputPowerW = parseBoundedFloat(value, 6000)
+			}
 		}
 		out[slot] = entry
 	}
 	return out
 }

+// PSUSlotPower holds SDR power readings for one PSU slot.
+// Slot key used by PSUSlotsFromSDR is the 0-based index string,
+// matching HardwarePowerSupply.Slot in the audit schema.
+type PSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`
+	OutputW *float64 `json:"output_w,omitempty"`
+	Status  string   `json:"status,omitempty"`
+}
+
+// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
+// using the same battle-tested slot patterns as the hardware audit collector.
+// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
+// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
+func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
+	sdr := parsePSUSDR(sdrOutput)
+	if len(sdr) == 0 {
+		return nil
+	}
+	out := make(map[string]PSUSlotPower, len(sdr))
+	for slot, entry := range sdr {
+		key := strconv.Itoa(slot - 1) // audit uses 0-based slot
+		out[key] = PSUSlotPower{
+			InputW:  entry.inputPowerW,
+			OutputW: entry.outputPowerW,
+			Status:  entry.status,
+		}
+	}
+	return out
+}
+
 func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
 	if len(sdr) == 0 {
 		return nil
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
 		{name: "PWS1 Status", want: 1},
 		{name: "Power Supply Bay 8", want: 8},
 		{name: "PS 6 Input Power", want: 6},
+		// MSI underscore format — \b does not fire between digit and '_'
+		{name: "PSU1_POWER_IN", want: 1},
+		{name: "PSU2_POWER_OUT", want: 2},
+		{name: "PSU4_STATUS", want: 4},
 	}

 	for _, tt := range tests {
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
 	}
 }

+// TestParsePSUSDRMSIFormat checks that MSI-style underscore sensor names
+// (PSUn_POWER_IN / PSUn_POWER_OUT) are grouped per slot and that input and
+// output wattage land in the right fields.
+func TestParsePSUSDRMSIFormat(t *testing.T) {
+	t.Parallel()
+	raw := `
+PSU1_STATUS      | F1h | ok
+PSU1_POWER_OUT   | 928 Watts | ok
+PSU1_POWER_IN    | 976 Watts | ok
+PSU2_STATUS      | F2h | ok
+PSU2_POWER_OUT   | 944 Watts | ok
+PSU2_POWER_IN    | 992 Watts | ok
+`
+	got := parsePSUSDR(raw)
+	if len(got) != 2 {
+		t.Fatalf("len(got)=%d want 2", len(got))
+	}
+	if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
+		t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
+	}
+	if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
+		t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
+	}
+	if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
+		t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
+	}
+}
+
 func TestSynthesizePSUsFromSDR(t *testing.T) {
 	t.Parallel()

@@ -250,6 +250,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 	}

 	var info smartctlInfo
+	var raw map[string]any
+	_ = json.Unmarshal(out, &raw)
 	if err := json.Unmarshal(out, &info); err == nil {
 		if v := cleanDMIValue(info.ModelName); v != "" {
 			s.Model = &v
@@ -302,8 +304,11 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 				value := float64(attr.Raw.Value)
 				s.LifeRemainingPct = &value
 			case 241:
-				value := attr.Raw.Value
+				value := smartLBAsToBytes(attr.Raw.Value)
 				s.WrittenBytes = &value
+			case 242:
+				value := smartLBAsToBytes(attr.Raw.Value)
+				s.ReadBytes = &value
 			case 197:
 				pending = attr.Raw.Value
 				s.CurrentPendingSectors = &pending
@@ -321,6 +326,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 			offlineUncorrectable: uncorrectable,
 			lifeRemainingPct:     lifeRemaining,
 		}
+		applySCSISmartctlTelemetry(&s, raw, &status)
 		setStorageHealthStatus(&s, status)
 		return s
 	}
@@ -477,6 +483,127 @@ func nvmeDataUnitsToBytes(units int64) int64 {
 	return units * 512000
 }

+func smartLBAsToBytes(lbas int64) int64 {
+	if lbas <= 0 {
+		return 0
+	}
+	return lbas * 512
+}
+
+// applySCSISmartctlTelemetry fills storage telemetry fields from the generic
+// decoded smartctl JSON document in raw. It is a no-op when s is nil or raw is
+// empty.
+//
+// Fields of s that are already set are never overwritten, and only strictly
+// positive values are applied. For each field the first candidate JSON path
+// that resolves to an integer is used. The health status counters are filled
+// only when status is non-nil and the corresponding counter is still zero.
+func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, status *storageHealthStatus) {
+	if s == nil || len(raw) == 0 {
+		return
+	}
+	if v, ok := firstInt64(raw,
+		"path:power_on_time.hours",
+		"path:accumulated_power_on_time.hours",
+		"path:power_on_time.hour",
+		"path:accumulated_power_on_time.hour",
+	); ok && v > 0 && s.PowerOnHours == nil {
+		s.PowerOnHours = &v
+	}
+	if v, ok := firstInt64(raw,
+		"path:power_cycle_count",
+		"path:start_stop_cycle_count",
+		"path:accumulated_start_stop_cycles",
+	); ok && v > 0 && s.PowerCycles == nil {
+		s.PowerCycles = &v
+	}
+	// The grown defect list count is reported as reallocated sectors.
+	if v, ok := firstInt64(raw,
+		"path:scsi_grown_defect_list",
+		"path:grown_defect_list",
+	); ok && v > 0 && s.ReallocatedSectors == nil {
+		s.ReallocatedSectors = &v
+		if status != nil && status.reallocatedSectors == 0 {
+			status.reallocatedSectors = v
+		}
+	}
+	if v, ok := firstInt64(raw,
+		"path:percentage_used_endurance_indicator",
+		"path:scsi_percentage_used_endurance_indicator",
+	); ok && v > 0 {
+		if s.LifeUsedPct == nil {
+			fv := float64(v)
+			s.LifeUsedPct = &fv
+		}
+		// Remaining life is derived as 100 - used, only for values up to 100.
+		if s.LifeRemainingPct == nil && v <= 100 {
+			remaining := float64(100 - v)
+			s.LifeRemainingPct = &remaining
+			if status != nil && status.lifeRemainingPct == 0 {
+				status.lifeRemainingPct = int64(remaining)
+			}
+		}
+	}
+	// Byte counters need a block size to convert block counts into bytes.
+	blockSize, hasBlockSize := firstInt64(raw,
+		"path:logical_block_size",
+		"path:block_size",
+		"path:user_capacity.block_size",
+	)
+	if hasBlockSize && blockSize > 0 {
+		if v, ok := firstInt64(raw,
+			"path:logical_blocks_written",
+			"path:total_lbas_written",
+		); ok && v > 0 && s.WrittenBytes == nil {
+			bytes := v * blockSize
+			s.WrittenBytes = &bytes
+		}
+		if v, ok := firstInt64(raw,
+			"path:logical_blocks_read",
+			"path:total_lbas_read",
+		); ok && v > 0 && s.ReadBytes == nil {
+			bytes := v * blockSize
+			s.ReadBytes = &bytes
+		}
+	}
+}
+
+func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
+	for _, candidate := range candidates {
+		if !strings.HasPrefix(candidate, "path:") {
+			continue
+		}
+		path := strings.TrimPrefix(candidate, "path:")
+		if v, ok := nestedInt64(root, strings.Split(path, ".")); ok {
+			return v, true
+		}
+	}
+	return 0, false
+}
+
+// nestedInt64 follows path through nested JSON objects starting at root and
+// converts the value found at the end to an int64.
+//
+// Numeric values are converted directly (floats are truncated), json.Number
+// and string values are parsed as base-10 integers. The bool is false when a
+// path element is missing, an intermediate value is not an object, the final
+// value has another type, or parsing fails.
+func nestedInt64(root map[string]any, path []string) (int64, bool) {
+	var node any = root
+	for _, key := range path {
+		object, isObject := node.(map[string]any)
+		if !isObject {
+			return 0, false
+		}
+		child, present := object[key]
+		if !present {
+			return 0, false
+		}
+		node = child
+	}
+
+	switch value := node.(type) {
+	case int64:
+		return value, true
+	case int32:
+		return int64(value), true
+	case int:
+		return int64(value), true
+	case float64:
+		return int64(value), true
+	case float32:
+		return int64(value), true
+	case json.Number:
+		parsed, err := value.Int64()
+		return parsed, err == nil
+	case string:
+		parsed, err := strconv.ParseInt(strings.TrimSpace(value), 10, 64)
+		return parsed, err == nil
+	}
+	return 0, false
+}
+
 type storageHealthStatus struct {
 	hasOverall           bool
 	overallPassed        bool
@@ -0,0 +1,89 @@
+package collector
+
+import (
+	"testing"
+
+	"bee/audit/internal/schema"
+)
+
+// TestApplySCSISmartctlTelemetry feeds a smartctl-style SCSI document into
+// applySCSISmartctlTelemetry and checks every value copied onto an empty disk
+// record and onto the accompanying health status.
+func TestApplySCSISmartctlTelemetry(t *testing.T) {
+	t.Parallel()
+
+	var (
+		disk   schema.HardwareStorage
+		status storageHealthStatus
+	)
+	payload := map[string]any{
+		"power_on_time": map[string]any{
+			"hours": float64(32123),
+		},
+		"accumulated_start_stop_cycles":       float64(17),
+		"scsi_grown_defect_list":              float64(4),
+		"percentage_used_endurance_indicator": float64(12),
+		"logical_block_size":                  float64(4096),
+		"logical_blocks_written":              float64(1000),
+		"logical_blocks_read":                 float64(2000),
+	}
+
+	applySCSISmartctlTelemetry(&disk, payload, &status)
+
+	// Counters reported directly by the drive.
+	if got := disk.PowerOnHours; got == nil || *got != 32123 {
+		t.Fatalf("power_on_hours=%v want 32123", got)
+	}
+	if got := disk.PowerCycles; got == nil || *got != 17 {
+		t.Fatalf("power_cycles=%v want 17", got)
+	}
+	if got := disk.ReallocatedSectors; got == nil || *got != 4 {
+		t.Fatalf("reallocated=%v want 4", got)
+	}
+
+	// Byte totals are block counts multiplied by the 4096-byte block size.
+	if got := disk.WrittenBytes; got == nil || *got != 4096000 {
+		t.Fatalf("written_bytes=%v want 4096000", got)
+	}
+	if got := disk.ReadBytes; got == nil || *got != 8192000 {
+		t.Fatalf("read_bytes=%v want 8192000", got)
+	}
+
+	// Endurance: 12% used leaves 88% remaining.
+	if got := disk.LifeUsedPct; got == nil || *got != 12 {
+		t.Fatalf("life_used_pct=%v want 12", got)
+	}
+	if got := disk.LifeRemainingPct; got == nil || *got != 88 {
+		t.Fatalf("life_remaining_pct=%v want 88", got)
+	}
+
+	// The same figures must be mirrored into the health status.
+	if status.reallocatedSectors != 4 {
+		t.Fatalf("status.reallocated=%d want 4", status.reallocatedSectors)
+	}
+	if status.lifeRemainingPct != 88 {
+		t.Fatalf("status.life_remaining_pct=%d want 88", status.lifeRemainingPct)
+	}
+}
+
+func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T) {
+	t.Parallel()
+
+	powerOnHours := int64(10)
+	writtenBytes := int64(20)
+	lifeRemaining := 30.0
+	disk := schema.HardwareStorage{
+		PowerOnHours:     &powerOnHours,
+		WrittenBytes:     &writtenBytes,
+		LifeRemainingPct: &lifeRemaining,
+	}
+	raw := map[string]any{
+		"power_on_time":                       map[string]any{"hours": float64(999)},
+		"logical_block_size":                  float64(512),
+		"logical_blocks_written":              float64(999),
+		"percentage_used_endurance_indicator": float64(50),
+	}
+
+	applySCSISmartctlTelemetry(&disk, raw, nil)
+
+	if *disk.PowerOnHours != 10 {
+		t.Fatalf("power_on_hours overwritten: got %d want 10", *disk.PowerOnHours)
+	}
+	if *disk.WrittenBytes != 20 {
+		t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
+	}
+	if *disk.LifeRemainingPct != 30 {
+		t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
+	}
+	if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 50 {
+		t.Fatalf("life_used_pct=%v want 50", disk.LifeUsedPct)
+	}
+}
@@ -0,0 +1,25 @@
+package collector
+
+import "testing"
+
+func TestSmartLBAsToBytes(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		lbas int64
+		want int64
+	}{
+		{name: "zero", lbas: 0, want: 0},
+		{name: "single lba", lbas: 1, want: 512},
+		{name: "multiple lbas", lbas: 2048, want: 1048576},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := smartLBAsToBytes(tt.lbas); got != tt.want {
+				t.Fatalf("smartLBAsToBytes(%d)=%d want %d", tt.lbas, got, tt.want)
+			}
+		})
+	}
+}
@@ -0,0 +1,735 @@
+package platform
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"math"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+)
+
+// Tuning constants for the benchmark power-source autotune procedure.
+const (
+	// benchmarkPowerAutotuneVersion is stamped into saved autotune configs
+	// that do not already carry a positive version.
+	benchmarkPowerAutotuneVersion         = 1
+	// benchmarkPowerAutotuneIdleSec is the length, in seconds, of the idle
+	// telemetry window collected before the load stage.
+	benchmarkPowerAutotuneIdleSec         = 60
+	// benchmarkPowerAutotuneLoadSec is the length, in seconds, of the
+	// full-load stage and of its telemetry window.
+	benchmarkPowerAutotuneLoadSec         = 90
+	// benchmarkPowerAutotuneSampleInterval is the pause, in seconds, between
+	// telemetry samples; it is also the sampler default when no positive
+	// interval is supplied.
+	benchmarkPowerAutotuneSampleInterval  = 3
+	// defaultBenchmarkPowerSourceConfigPath is where the autotune config is
+	// looked up when no base directory is given.
+	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
+)
+
+// BenchmarkPowerSourceConfigPath returns the location of the power-source
+// autotune config for the given base directory: the file
+// "power-source-autotune.json" placed in the parent of baseDir. A blank
+// baseDir yields the built-in default path.
+func BenchmarkPowerSourceConfigPath(baseDir string) string {
+	if trimmed := strings.TrimSpace(baseDir); trimmed != "" {
+		return filepath.Join(filepath.Dir(trimmed), "power-source-autotune.json")
+	}
+	return defaultBenchmarkPowerSourceConfigPath
+}
+
+func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var cfg BenchmarkPowerAutotuneConfig
+	if err := json.Unmarshal(raw, &cfg); err != nil {
+		return nil, err
+	}
+	if strings.TrimSpace(cfg.SelectedSource) == "" {
+		return nil, fmt.Errorf("autotune config missing selected_source")
+	}
+	return &cfg, nil
+}
+
+// SaveBenchmarkPowerAutotuneConfig writes cfg as indented JSON to path,
+// creating the parent directory when needed. A non-positive cfg.Version is
+// replaced by the current autotune version before encoding.
+//
+// The data is first written to "<path>.tmp" and then renamed over path so
+// that readers never observe a partially written config. If the write or the
+// rename fails, the temporary file is removed (best effort) and the error is
+// returned.
+func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
+	if strings.TrimSpace(path) == "" {
+		return fmt.Errorf("empty autotune config path")
+	}
+	if cfg.Version <= 0 {
+		cfg.Version = benchmarkPowerAutotuneVersion
+	}
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return err
+	}
+	data, err := json.MarshalIndent(cfg, "", "  ")
+	if err != nil {
+		return err
+	}
+	tmp := path + ".tmp"
+	if err := os.WriteFile(tmp, data, 0644); err != nil {
+		// Best effort: do not leave a truncated temp file behind.
+		_ = os.Remove(tmp)
+		return err
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		// Best effort: the rename did not consume the temp file.
+		_ = os.Remove(tmp)
+		return err
+	}
+	return nil
+}
+
+// LoadSystemPowerSourceConfig loads the autotune config from the path that
+// BenchmarkPowerSourceConfigPath derives for exportDir.
+func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
+	configPath := BenchmarkPowerSourceConfigPath(exportDir)
+	return LoadBenchmarkPowerAutotuneConfig(configPath)
+}
+
+// ResetBenchmarkPowerAutotuneConfig deletes the autotune config at path. A
+// blank path is an error; a file that does not exist is not.
+func ResetBenchmarkPowerAutotuneConfig(path string) error {
+	if strings.TrimSpace(path) == "" {
+		return fmt.Errorf("empty autotune config path")
+	}
+	err := os.Remove(path)
+	if err == nil || os.IsNotExist(err) {
+		return nil
+	}
+	return err
+}
+
+// normalizeBenchmarkPowerSource maps a free-form source name onto one of the
+// known source identifiers. Matching ignores case and surrounding whitespace;
+// anything other than the SDR PSU input identifier resolves to DCMI.
+func normalizeBenchmarkPowerSource(source string) string {
+	cleaned := strings.TrimSpace(strings.ToLower(source))
+	if cleaned == BenchmarkPowerSourceSDRPSUInput {
+		return BenchmarkPowerSourceSDRPSUInput
+	}
+	return BenchmarkPowerSourceDCMI
+}
+
+// ResolveSystemPowerDecision decides which source should be used to read
+// whole-system power.
+//
+// When a valid autotune config can be loaded for exportDir, its selected
+// source (normalized) is returned with mode "autotuned". Otherwise the
+// function falls back: the SDR PSU input is used if it currently reports a
+// positive reading, and DCMI is used if it does not. Fallback decisions have
+// Configured set to false and mode "fallback".
+func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
+	cfg, err := LoadSystemPowerSourceConfig(exportDir)
+	if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
+		selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
+		return SystemPowerSourceDecision{
+			Configured:      true,
+			SelectedSource:  selected,
+			EffectiveSource: selected,
+			Mode:            "autotuned",
+			Reason:          strings.TrimSpace(cfg.Reason),
+			ConfiguredAt:    cfg.UpdatedAt,
+		}
+	}
+
+	// Only the SDR PSU input has to be probed: DCMI is the unconditional last
+	// resort, so querying it here would cost a BMC round trip whose result is
+	// never looked at.
+	if watts, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && watts > 0 {
+		return SystemPowerSourceDecision{
+			Configured:      false,
+			EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
+			Mode:            "fallback",
+			Reason:          "autotune config not found; using temporary fallback source sdr_psu_input",
+		}
+	}
+	return SystemPowerSourceDecision{
+		Configured:      false,
+		EffectiveSource: BenchmarkPowerSourceDCMI,
+		Mode:            "fallback",
+		Reason:          "autotune config not found; using temporary fallback source dcmi",
+	}
+}
+
+// SampleSystemPowerResolved reads the current system power in watts from the
+// source chosen by ResolveSystemPowerDecision and returns the reading together
+// with the decision that produced it.
+//
+// If the chosen source came from an autotune config but yields no positive
+// reading, the other known source is tried; on success the decision is marked
+// "degraded" and its EffectiveSource is switched to the fallback. When no
+// reading can be obtained the returned value is 0 and the error is always
+// non-nil.
+func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
+	decision := ResolveSystemPowerDecision(exportDir)
+	if decision.EffectiveSource == "" {
+		return 0, decision, fmt.Errorf("system power source unavailable")
+	}
+	value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource)
+	if err == nil && value > 0 {
+		return value, decision, nil
+	}
+	if !decision.Configured {
+		// The fallback decision already picked the best available source.
+		return 0, decision, fmt.Errorf("system power source unavailable")
+	}
+	if err == nil {
+		// The query succeeded but produced no usable reading; callers must
+		// still see a failure rather than a zero reading with a nil error.
+		err = fmt.Errorf("system power source %s returned no reading", decision.EffectiveSource)
+	}
+
+	fallback := BenchmarkPowerSourceDCMI
+	if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
+		fallback = BenchmarkPowerSourceSDRPSUInput
+	}
+	decision.Mode = "degraded"
+	if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
+		decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
+		decision.EffectiveSource = fallback
+		return fallbackValue, decision, nil
+	}
+	decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
+	return 0, decision, err
+}
+
+// queryBenchmarkPowerSourceW returns the current reading, in watts, of the
+// named power source. The name is normalized first, so any unknown name is
+// served by the IPMI server-power query. For the SDR PSU input a
+// non-positive sensor value is reported as an error.
+func queryBenchmarkPowerSourceW(source string) (float64, error) {
+	if normalizeBenchmarkPowerSource(source) != BenchmarkPowerSourceSDRPSUInput {
+		return queryIPMIServerPowerW()
+	}
+	if watts := sampleIPMISDRPowerSensors().PSUInW; watts > 0 {
+		return watts, nil
+	}
+	return 0, fmt.Errorf("sdr psu input unavailable")
+}
+
+// sampleBenchmarkPowerSources takes one reading from every known power source
+// and returns the positive ones keyed by source identifier. Sources that fail
+// or report a non-positive value are left out of the map.
+func sampleBenchmarkPowerSources() map[string]float64 {
+	readings := map[string]float64{}
+
+	dcmiW, dcmiErr := queryIPMIServerPowerW()
+	if dcmiErr == nil && dcmiW > 0 {
+		readings[BenchmarkPowerSourceDCMI] = dcmiW
+	}
+
+	sdrW, sdrErr := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput)
+	if sdrErr == nil && sdrW > 0 {
+		readings[BenchmarkPowerSourceSDRPSUInput] = sdrW
+	}
+
+	return readings
+}
+
+// sampleBenchmarkPowerSourceSeries samples source for durationSec seconds at
+// the given interval and returns the mean of the collected readings. The
+// boolean result is false when the duration is not positive or when no
+// reading was collected.
+func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
+	if durationSec > 0 {
+		readings := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
+		if len(readings) > 0 {
+			return benchmarkMean(readings), true
+		}
+	}
+	return 0, false
+}
+
+// collectSelectedPowerSourceSamples runs a background sampler on source until
+// durationSec seconds have passed or ctx is done, whichever comes first, and
+// returns the readings gathered so far. A non-positive duration returns nil
+// without sampling.
+func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
+	if durationSec <= 0 {
+		return nil
+	}
+	stop := make(chan struct{})
+	results := startSelectedPowerSourceSampler(stop, source, intervalSec)
+
+	window := time.NewTimer(time.Duration(durationSec) * time.Second)
+	defer window.Stop()
+	select {
+	case <-window.C:
+	case <-ctx.Done():
+	}
+
+	// Signal the sampler and wait for it to hand over what it collected.
+	close(stop)
+	return <-results
+}
+
+// startSelectedPowerSourceSampler launches a goroutine that reads source once
+// immediately and then every intervalSec seconds (the autotune sample
+// interval when intervalSec is not positive). Only successful, positive
+// readings are kept. When stopCh is closed the goroutine sends the collected
+// readings on the returned channel, closes it, and exits.
+func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
+	period := intervalSec
+	if period <= 0 {
+		period = benchmarkPowerAutotuneSampleInterval
+	}
+	out := make(chan []float64, 1)
+	go func() {
+		defer close(out)
+
+		var readings []float64
+		takeReading := func() {
+			watts, err := queryBenchmarkPowerSourceW(source)
+			if err == nil && watts > 0 {
+				readings = append(readings, watts)
+			}
+		}
+
+		takeReading()
+		tick := time.NewTicker(time.Duration(period) * time.Second)
+		defer tick.Stop()
+		for {
+			select {
+			case <-tick.C:
+				takeReading()
+			case <-stopCh:
+				out <- readings
+				return
+			}
+		}
+	}()
+	return out
+}
+
+// benchmarkPowerAutotuneSample is one telemetry row captured during an
+// autotune phase.
+type benchmarkPowerAutotuneSample struct {
+	// ElapsedSec is the time in seconds since the phase started sampling.
+	ElapsedSec     float64
+	// GPUAvgUsagePct is the mean utilisation across the sampled GPUs; it
+	// stays zero when GPU metrics could not be read for this row.
+	GPUAvgUsagePct float64
+	// CPUUsagePct is the CPU load reported by sampleCPULoadPct.
+	CPUUsagePct    float64
+	// GPUSumPowerW is the sum of the power draw reported by the sampled GPUs.
+	GPUSumPowerW   float64
+	// Sources holds the readings returned by sampleBenchmarkPowerSources,
+	// keyed by power-source identifier.
+	Sources        map[string]float64
+}
+
+// collectBenchmarkPowerAutotuneSamples records telemetry rows (CPU load,
+// server power sources and GPU metrics for gpuIndices) for roughly
+// durationSec seconds, pausing the autotune sample interval between rows.
+// Each row is logged through logFunc under the given phase label. Sampling
+// stops early when ctx is done; the rows gathered so far are returned. A
+// non-positive duration returns nil.
+func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
+	if durationSec <= 0 {
+		return nil
+	}
+	var rows []benchmarkPowerAutotuneSample
+	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
+	startedAt := time.Now()
+	for ctx.Err() == nil {
+		sample := benchmarkPowerAutotuneSample{
+			ElapsedSec:  time.Since(startedAt).Seconds(),
+			CPUUsagePct: sampleCPULoadPct(),
+			Sources:     sampleBenchmarkPowerSources(),
+		}
+		gpus, gpuErr := sampleGPUMetrics(gpuIndices)
+		if gpuErr == nil && len(gpus) > 0 {
+			var totalPowerW, totalUsage float64
+			for _, gpu := range gpus {
+				totalPowerW += gpu.PowerW
+				totalUsage += gpu.UsagePct
+			}
+			sample.GPUSumPowerW = totalPowerW
+			sample.GPUAvgUsagePct = totalUsage / float64(len(gpus))
+		}
+		rows = append(rows, sample)
+		logBenchmarkPowerAutotuneSample(phase, sample, logFunc)
+
+		if time.Now().After(deadline) {
+			break
+		}
+		select {
+		case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
+		case <-ctx.Done():
+			return rows
+		}
+	}
+	return rows
+}
+
+func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
+	if logFunc == nil {
+		return
+	}
+	var sourceParts []string
+	for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
+		if value, ok := sample.Sources[source]; ok && value > 0 {
+			sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
+		} else {
+			sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
+		}
+	}
+	logFunc(fmt.Sprintf(
+		"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
+		phase,
+		sample.ElapsedSec,
+		sample.GPUAvgUsagePct,
+		sample.GPUSumPowerW,
+		sample.CPUUsagePct,
+		strings.Join(sourceParts, " "),
+	))
+}
+
+// logBenchmarkPowerAutotunePhaseSummary logs aggregate statistics for one
+// autotune phase: sample count, mean and 95th-percentile GPU and CPU usage,
+// mean GPU power, and the mean of every power source over its positive
+// readings ("n/a" when a source has none). It does nothing when logFunc is
+// nil or samples is empty.
+func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
+	if logFunc == nil || len(samples) == 0 {
+		return
+	}
+	var (
+		gpuUtil   []float64
+		cpuUtil   []float64
+		gpuWatts  []float64
+		perSource = map[string][]float64{}
+	)
+	for i := range samples {
+		row := samples[i]
+		gpuUtil = append(gpuUtil, row.GPUAvgUsagePct)
+		cpuUtil = append(cpuUtil, row.CPUUsagePct)
+		gpuWatts = append(gpuWatts, row.GPUSumPowerW)
+		for name, watts := range row.Sources {
+			if watts > 0 {
+				perSource[name] = append(perSource[name], watts)
+			}
+		}
+	}
+
+	var parts []string
+	for _, name := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
+		if readings := perSource[name]; len(readings) > 0 {
+			parts = append(parts, fmt.Sprintf("%s_avg=%.0fW", name, benchmarkMean(readings)))
+		} else {
+			parts = append(parts, fmt.Sprintf("%s_avg=n/a", name))
+		}
+	}
+
+	logFunc(fmt.Sprintf(
+		"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
+		phase,
+		len(samples),
+		benchmarkMean(gpuUtil),
+		benchmarkPercentile(gpuUtil, 95),
+		benchmarkMean(gpuWatts),
+		benchmarkMean(cpuUtil),
+		benchmarkPercentile(cpuUtil, 95),
+		strings.Join(parts, " "),
+	))
+}
+
+// logBenchmarkPowerAutotuneSelection logs one line per candidate source:
+// either that it is unavailable, or its idle/load averages, delta, relative
+// error and confidence, with " SELECTED" appended for the chosen source. A
+// candidate's selection notes, when non-blank, are logged on a second line.
+// Nothing is logged when logFunc is nil.
+func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
+	if logFunc == nil {
+		return
+	}
+	for i := range candidates {
+		candidate := candidates[i]
+		if !candidate.Available {
+			logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
+			continue
+		}
+		marker := ""
+		if candidate.Source == selectedSource {
+			marker = " SELECTED"
+		}
+		logFunc(fmt.Sprintf(
+			"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
+			candidate.Source,
+			candidate.IdleAvgW,
+			candidate.LoadAvgW,
+			candidate.DeltaW,
+			gpuDelta,
+			candidate.RelativeError,
+			candidate.Confidence*100,
+			marker,
+		))
+		if strings.TrimSpace(candidate.SelectionNotes) == "" {
+			continue
+		}
+		logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
+	}
+}
+
+// validateBenchmarkPowerAutotuneIdle decides whether the idle-phase samples
+// describe a sufficiently quiet machine. GPU usage is taken from every
+// sample; CPU usage only from samples with a positive value. Averages and
+// 95th percentiles are rounded to one decimal and compared, in order, against
+// the limits 5% (GPU average), 10% (GPU p95), 20% (CPU average) and 35%
+// (CPU p95). The first exceeded limit is recorded in Reason; Valid is set
+// only when none is exceeded. An empty sample set is invalid.
+func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
+	verdict := &BenchmarkPowerAutotuneValidation{}
+	if len(samples) == 0 {
+		verdict.Reason = "no idle telemetry samples collected"
+		return verdict
+	}
+
+	var gpuLoads, cpuLoads []float64
+	for i := range samples {
+		gpuLoads = append(gpuLoads, samples[i].GPUAvgUsagePct)
+		if cpu := samples[i].CPUUsagePct; cpu > 0 {
+			cpuLoads = append(cpuLoads, cpu)
+		}
+	}
+
+	roundTenth := func(v float64) float64 { return math.Round(v*10) / 10 }
+	verdict.GPUSamples = len(gpuLoads)
+	verdict.CPUSamples = len(cpuLoads)
+	verdict.GPUAvgUsagePct = roundTenth(benchmarkMean(gpuLoads))
+	verdict.GPUP95UsagePct = roundTenth(benchmarkPercentile(gpuLoads, 95))
+	verdict.CPUAvgUsagePct = roundTenth(benchmarkMean(cpuLoads))
+	verdict.CPUP95UsagePct = roundTenth(benchmarkPercentile(cpuLoads, 95))
+
+	if verdict.GPUAvgUsagePct > 5 {
+		verdict.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", verdict.GPUAvgUsagePct)
+	} else if verdict.GPUP95UsagePct > 10 {
+		verdict.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", verdict.GPUP95UsagePct)
+	} else if verdict.CPUAvgUsagePct > 20 {
+		verdict.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", verdict.CPUAvgUsagePct)
+	} else if verdict.CPUP95UsagePct > 35 {
+		verdict.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", verdict.CPUP95UsagePct)
+	} else {
+		verdict.Valid = true
+	}
+	return verdict
+}
+
+// chooseBenchmarkPowerAutotuneSource picks the server power source whose
+// idle-to-load increase best matches the increase in GPU-reported power.
+//
+// It returns the selected source identifier, one candidate record per known
+// source (the chosen one has Selected and SelectionNotes set), the mean GPU
+// power during idle and during load, and an error when no source has positive
+// readings in both phases together with a positive delta.
+//
+// The GPU delta is the load mean minus the idle mean; if that is not positive
+// the load mean itself is used. Candidates are ranked by relative error
+// against the GPU delta, except that when two candidates are within
+// preferenceTolerance of each other the SDR PSU input is preferred. The
+// selection note states which of the two rules decided the outcome.
+func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
+	// preferenceTolerance is the relative-error gap below which the SDR PSU
+	// input wins regardless of which candidate has the smaller error.
+	const preferenceTolerance = 0.10
+
+	// bucket splits samples into the GPU power series and the positive
+	// readings of every server power source.
+	bucket := func(samples []benchmarkPowerAutotuneSample) ([]float64, map[string][]float64) {
+		var gpu []float64
+		bySource := map[string][]float64{}
+		for _, sample := range samples {
+			gpu = append(gpu, sample.GPUSumPowerW)
+			for source, value := range sample.Sources {
+				if value > 0 {
+					bySource[source] = append(bySource[source], value)
+				}
+			}
+		}
+		return gpu, bySource
+	}
+	idleGPU, idleBySource := bucket(idle)
+	loadGPU, loadBySource := bucket(load)
+
+	idleGPUAvg := benchmarkMean(idleGPU)
+	loadGPUAvg := benchmarkMean(loadGPU)
+	gpuDelta := loadGPUAvg - idleGPUAvg
+	if gpuDelta <= 0 {
+		gpuDelta = loadGPUAvg
+	}
+
+	candidates := []BenchmarkPowerAutotuneCandidate{
+		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
+		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
+	}
+	available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
+	for _, candidate := range candidates {
+		if candidate.Available && candidate.DeltaW > 0 {
+			available = append(available, candidate)
+		}
+	}
+	if len(available) == 0 {
+		return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
+	}
+	sort.Slice(available, func(i, j int) bool {
+		if math.Abs(available[i].RelativeError-available[j].RelativeError) <= preferenceTolerance {
+			if available[i].Source != available[j].Source {
+				return available[i].Source == BenchmarkPowerSourceSDRPSUInput
+			}
+		}
+		if available[i].RelativeError != available[j].RelativeError {
+			return available[i].RelativeError < available[j].RelativeError
+		}
+		return available[i].Samples > available[j].Samples
+	})
+	selected := available[0]
+
+	// The winner may have been chosen by the tolerance preference rather than
+	// by having the smallest error; describe the real reason in that case.
+	bestError := selected.RelativeError
+	for _, candidate := range available[1:] {
+		if candidate.RelativeError < bestError {
+			bestError = candidate.RelativeError
+		}
+	}
+	note := fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
+	if selected.RelativeError > bestError {
+		note = fmt.Sprintf("selected because %s is preferred when relative errors are within %.2f: delta %.0f W vs GPU delta %.0f W (relative error %.3f, best candidate %.3f)", selected.Source, preferenceTolerance, selected.DeltaW, gpuDelta, selected.RelativeError, bestError)
+	}
+	for idx := range candidates {
+		if candidates[idx].Source == selected.Source {
+			candidates[idx].Selected = true
+			candidates[idx].SelectionNotes = note
+		}
+	}
+	return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
+}
+
+// buildBenchmarkPowerAutotuneCandidate summarises one power source from its
+// idle and load readings. The candidate is available only when both series
+// are non-empty; Samples is the length of the shorter series. For an
+// available candidate the idle and load means and their difference are
+// filled in, and, when gpuDelta is positive, the relative error against
+// gpuDelta plus a confidence of 1 minus that error, floored at 0.
+func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
+	usable := len(idle) > 0 && len(load) > 0
+	result := BenchmarkPowerAutotuneCandidate{
+		Source:    source,
+		Available: usable,
+		Samples:   minInt(len(idle), len(load)),
+	}
+	if !usable {
+		return result
+	}
+	idleAvg, loadAvg := benchmarkMean(idle), benchmarkMean(load)
+	result.IdleAvgW = idleAvg
+	result.LoadAvgW = loadAvg
+	result.DeltaW = result.LoadAvgW - result.IdleAvgW
+	if gpuDelta <= 0 {
+		return result
+	}
+	result.RelativeError = math.Abs(result.DeltaW-gpuDelta) / gpuDelta
+	result.Confidence = math.Max(0, 1-result.RelativeError)
+	return result
+}
+
+// renderBenchmarkPowerAutotuneSummary renders result as newline-terminated
+// key=value lines: run metadata first, then the idle validation figures (when
+// present), then per-candidate availability and, for available candidates,
+// their averages, delta and relative error.
+func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
+	var out strings.Builder
+	emit := func(format string, args ...any) {
+		fmt.Fprintf(&out, format, args...)
+	}
+
+	emit("generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
+	emit("status=%s\n", result.Status)
+	emit("benchmark_kind=%s\n", result.BenchmarkKind)
+	emit("profile=%s\n", result.Profile)
+	emit("idle_duration_sec=%d\n", result.IdleDurationSec)
+	emit("load_duration_sec=%d\n", result.LoadDurationSec)
+	emit("sample_interval_sec=%d\n", result.SampleIntervalSec)
+	if result.SelectedSource != "" {
+		emit("selected_source=%s\n", result.SelectedSource)
+	}
+
+	if idle := result.IdleValidation; idle != nil {
+		emit("idle_valid=%t\n", idle.Valid)
+		emit("idle_gpu_avg_usage_pct=%.1f\n", idle.GPUAvgUsagePct)
+		emit("idle_gpu_p95_usage_pct=%.1f\n", idle.GPUP95UsagePct)
+		emit("idle_cpu_avg_usage_pct=%.1f\n", idle.CPUAvgUsagePct)
+		emit("idle_cpu_p95_usage_pct=%.1f\n", idle.CPUP95UsagePct)
+		if idle.Reason != "" {
+			emit("idle_validation_error=%s\n", idle.Reason)
+		}
+	}
+
+	for _, candidate := range result.Candidates {
+		emit("candidate_%s_available=%t\n", candidate.Source, candidate.Available)
+		if !candidate.Available {
+			continue
+		}
+		emit("candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
+		emit("candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
+		emit("candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
+		emit("candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
+	}
+	return out.String()
+}
+
+// renderBenchmarkPowerAutotuneReport renders result as a Markdown report: a
+// header with run metadata, an "Idle Validation" section when validation data
+// is present, a "Candidates" table when there are candidates (unavailable
+// ones are shown with dashes), and finally the notes as a bullet list.
+func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
+	var out strings.Builder
+	emit := func(format string, args ...any) {
+		fmt.Fprintf(&out, format, args...)
+	}
+
+	out.WriteString("# Bee Bench Power Source Autotune\n\n")
+	emit("**Status:** %s  \n", result.Status)
+	emit("**Benchmark kind:** %s  \n", result.BenchmarkKind)
+	emit("**Profile:** %s  \n", result.Profile)
+	emit("**Idle window:** %ds  \n", result.IdleDurationSec)
+	emit("**Load window:** %ds  \n", result.LoadDurationSec)
+	emit("**Sample interval:** %ds  \n", result.SampleIntervalSec)
+	if result.SelectedSource != "" {
+		emit("**Selected source:** `%s`  \n", result.SelectedSource)
+	}
+	out.WriteString("\n")
+
+	if idle := result.IdleValidation; idle != nil {
+		out.WriteString("## Idle Validation\n\n")
+		emit("- valid: %t\n", idle.Valid)
+		emit("- GPU avg usage: %.1f%%\n", idle.GPUAvgUsagePct)
+		emit("- GPU p95 usage: %.1f%%\n", idle.GPUP95UsagePct)
+		emit("- CPU avg usage: %.1f%%\n", idle.CPUAvgUsagePct)
+		emit("- CPU p95 usage: %.1f%%\n", idle.CPUP95UsagePct)
+		if idle.Reason != "" {
+			emit("- reason: %s\n", idle.Reason)
+		}
+		out.WriteString("\n")
+	}
+
+	if len(result.Candidates) > 0 {
+		out.WriteString("## Candidates\n\n")
+		out.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
+		out.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
+		for _, candidate := range result.Candidates {
+			if !candidate.Available {
+				emit("| %s | — | — | — | — | no |\n", candidate.Source)
+				continue
+			}
+			mark := "no"
+			if candidate.Selected {
+				mark = "yes"
+			}
+			emit("| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
+				candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, mark)
+		}
+		out.WriteString("\n")
+	}
+
+	for _, note := range result.Notes {
+		emit("- %s\n", note)
+	}
+	return out.String()
+}
+
+// benchmarkAutotuneLoadCommand builds the command line used to load the GPUs
+// during autotune and returns it with the normalized benchmark kind.
+//
+// The kinds "power-fit", "power" and "nvidia-bench-power" (case and
+// surrounding whitespace ignored) yield kind "power-fit": the resolved power
+// load command when it can be resolved, otherwise the DCGM "targeted_power"
+// diagnostic. Every other kind yields "performance" with a bee-gpu-burn
+// command; --size-mb is appended only when sizeMB is positive.
+func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
+	devices := joinIndexList(gpuIndices)
+	normalized := strings.TrimSpace(strings.ToLower(kind))
+
+	if normalized == "power-fit" || normalized == "power" || normalized == "nvidia-bench-power" {
+		if cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices); err == nil {
+			return cmd, "power-fit"
+		}
+		return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
+	}
+
+	args := []string{
+		"bee-gpu-burn",
+		"--seconds", fmt.Sprintf("%d", durationSec),
+		"--devices", devices,
+	}
+	if sizeMB > 0 {
+		args = append(args, "--size-mb", fmt.Sprintf("%d", sizeMB))
+	}
+	return args, "performance"
+}
+
+// RunNvidiaPowerSourceAutotune determines which server power source tracks
+// GPU power best and persists the choice as the autotune config.
+//
+// The run proceeds in stages:
+//  1. an idle window is sampled and validated; a failed validation stops the
+//     run before any load is applied;
+//  2. the load command for benchmarkKind is executed while telemetry is
+//     sampled in the background;
+//  3. the candidate sources are compared and the winner is saved to the path
+//     derived from baseDir by BenchmarkPowerSourceConfigPath.
+//
+// Artifacts (result.json, summary.txt, report.md, autotune-load.log) are
+// written to a timestamped run directory below baseDir; a blank baseDir
+// defaults to /var/log/bee-bench/autotune. A nil ctx is treated as
+// context.Background() and a nil logFunc as a no-op.
+//
+// The run directory is returned whenever the artifacts could be written, also
+// when the run itself failed (the error then describes the failing stage). If
+// the artifacts cannot be written, or setup fails before the run directory
+// exists, the returned path is empty.
+func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	if logFunc == nil {
+		logFunc = func(string) {}
+	}
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = "/var/log/bee-bench/autotune"
+	}
+	if err := os.MkdirAll(baseDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
+	}
+	selected, err := resolveNvidiaGPUSelection(nil, nil)
+	if err != nil {
+		return "", err
+	}
+	if len(selected) == 0 {
+		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
+	}
+	ts := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "autotune-"+ts)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
+	}
+	verboseLog := filepath.Join(runDir, "verbose.log")
+	hostname, _ := os.Hostname() // best effort: an empty hostname is acceptable in the report
+	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
+	// The result starts out as FAILED and is only promoted to OK once a
+	// source has been selected.
+	result := BenchmarkPowerAutotuneResult{
+		GeneratedAt:       time.Now().UTC(),
+		Hostname:          hostname,
+		ServerModel:       readServerModel(),
+		BenchmarkKind:     normalizedKind,
+		Profile:           opts.Profile,
+		Status:            "FAILED",
+		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
+		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
+		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
+	}
+
+	// Stage 1: idle baseline and validation.
+	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
+	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
+	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
+	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
+	if result.IdleValidation == nil || !result.IdleValidation.Valid {
+		if result.IdleValidation != nil {
+			result.IdleValidationError = result.IdleValidation.Reason
+			logFunc(result.IdleValidation.Reason)
+		}
+		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, fmt.Errorf("%s", result.IdleValidationError)
+	}
+
+	// Stage 2: run the load command while sampling in the background. The
+	// sampler gets its own cancellable context so that it stops as soon as
+	// the load command returns: otherwise a command that exits early would
+	// keep the caller waiting for the full load window, and readings taken
+	// after the load ended would be averaged in as load readings.
+	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
+	sampleCtx, stopSampling := context.WithCancel(ctx)
+	defer stopSampling()
+	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
+	go func() {
+		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(sampleCtx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
+	}()
+	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
+	stopSampling()
+	if writeErr := os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644); writeErr != nil {
+		// The log copy is a convenience artifact; record the problem but
+		// keep going.
+		result.Notes = append(result.Notes, "failed to write autotune-load.log: "+writeErr.Error())
+	}
+	loadSamples := <-loadSamplesCh
+	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
+	if runErr != nil {
+		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
+	}
+
+	// Stage 3: compare the candidates and persist the winner.
+	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
+	result.Candidates = candidates
+	result.GPUPowerIdleW = idleGPUAvg
+	result.GPUPowerLoadW = loadGPUAvg
+	if chooseErr != nil {
+		result.Notes = append(result.Notes, chooseErr.Error())
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, chooseErr
+	}
+	// Same GPU delta rule as in chooseBenchmarkPowerAutotuneSource, recomputed
+	// here for logging only.
+	gpuDelta := loadGPUAvg - idleGPUAvg
+	if gpuDelta <= 0 {
+		gpuDelta = loadGPUAvg
+	}
+	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
+	result.SelectedSource = selectedSource
+	result.Status = "OK"
+	var confidence float64
+	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
+	for _, candidate := range candidates {
+		if candidate.Selected {
+			confidence = candidate.Confidence
+			if strings.TrimSpace(candidate.SelectionNotes) != "" {
+				selectionReason = candidate.SelectionNotes
+			}
+			break
+		}
+	}
+	cfg := BenchmarkPowerAutotuneConfig{
+		Version:           benchmarkPowerAutotuneVersion,
+		UpdatedAt:         time.Now().UTC(),
+		SelectedSource:    selectedSource,
+		BenchmarkKind:     normalizedKind,
+		Profile:           opts.Profile,
+		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
+		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
+		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
+		Confidence:        confidence,
+		Reason:            selectionReason,
+	}
+	result.Config = &cfg
+	configPath := BenchmarkPowerSourceConfigPath(baseDir)
+	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
+		result.Status = "FAILED"
+		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
+		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
+			return "", writeErr
+		}
+		return runDir, err
+	}
+	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
+	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
+	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+		return "", err
+	}
+	return runDir, nil
+}
+
+// writeBenchmarkPowerAutotuneArtifacts stores result in runDir in three
+// forms, written in this order: result.json (indented JSON), summary.txt
+// (key=value lines) and report.md (Markdown). It stops at the first failure
+// and returns an error naming the step that failed.
+func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
+	encoded, err := json.MarshalIndent(result, "", "  ")
+	if err != nil {
+		return fmt.Errorf("marshal autotune result: %w", err)
+	}
+	artifacts := []struct {
+		name   string
+		render func() []byte
+	}{
+		{"result.json", func() []byte { return encoded }},
+		{"summary.txt", func() []byte { return []byte(renderBenchmarkPowerAutotuneSummary(result)) }},
+		{"report.md", func() []byte { return []byte(renderBenchmarkPowerAutotuneReport(result)) }},
+	}
+	for _, artifact := range artifacts {
+		target := filepath.Join(runDir, artifact.name)
+		if err := os.WriteFile(target, artifact.render(), 0644); err != nil {
+			return fmt.Errorf("write autotune %s: %w", artifact.name, err)
+		}
+	}
+	return nil
+}
+
+// minInt returns the smaller of a and b.
+func minInt(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+var _ = exec.ErrNotFound
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	if result.ScalabilityScore > 0 {
 		fmt.Fprintf(&b, "**Scalability score:** %.1f%%  \n", result.ScalabilityScore)
 	}
+	if result.PlatformPowerScore > 0 {
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n", result.PlatformPowerScore)
+	}
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	b.WriteString("\n")

@@ -81,69 +84,164 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")
 	}

-	// ── Methodology ───────────────────────────────────────────────────────────
-	b.WriteString("## Methodology\n\n")
-	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
-	b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
-	b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
-	b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
-	b.WriteString("**Compute score** is derived from two phases:\n\n")
-	b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
-	b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
-	b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
-	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
-	b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
-	b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
-	b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
-	b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
-	b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
-	b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
+	// ── Balanced Scorecard ────────────────────────────────────────────────────
+	b.WriteString("## Balanced Scorecard\n\n")

-	// ── Scorecard table ───────────────────────────────────────────────────────
-	b.WriteString("## Scorecard\n\n")
-	b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
-	b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
-	for _, gpu := range result.GPUs {
-		name := strings.TrimSpace(gpu.Name)
-		if name == "" {
-			name = "Unknown GPU"
+	// Perspective 1: Compatibility — hard stops
+	b.WriteString("### 1. Compatibility\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			thermalThrottle := "-"
+			if gpu.Scores.ThermalThrottlePct > 0 {
+				thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+			}
+			fanAtThrottle := "-"
+			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
+				fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+			}
+			ecc := "-"
+			if gpu.ECC.Uncorrected > 0 {
+				ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
+			}
+			compatStatus := "✓ OK"
+			if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
+				compatStatus = "⛔ HARD STOP"
+			}
+			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
 		}
-		interconnect := "-"
-		if gpu.Scores.InterconnectScore > 0 {
-			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
-		}
-		topsPerSM := "-"
-		if gpu.Scores.TOPSPerSMPerGHz > 0 {
-			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
-		}
-		synthetic := "-"
-		if gpu.Scores.SyntheticScore > 0 {
-			synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
-		}
-		mixed := "-"
-		if gpu.Scores.MixedScore > 0 {
-			mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
-		}
-		mixedEff := "-"
-		if gpu.Scores.MixedEfficiency > 0 {
-			mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
-		}
-		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
-			gpu.Index, name,
-			gpu.Status,
-			gpu.Scores.CompositeScore,
-			gpu.Scores.ComputeScore,
-			synthetic,
-			mixed,
-			mixedEff,
-			topsPerSM,
-			gpu.Scores.PowerSustainScore,
-			gpu.Scores.ThermalSustainScore,
-			gpu.Scores.StabilityScore,
-			interconnect,
-		)
+		b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 2: Thermal headroom
+	b.WriteString("### 2. Thermal Headroom\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			shutdownTemp := gpu.ShutdownTempC
+			if shutdownTemp <= 0 {
+				shutdownTemp = 90
+			}
+			slowdownTemp := gpu.SlowdownTempC
+			if slowdownTemp <= 0 {
+				slowdownTemp = 80
+			}
+			headroom := gpu.Scores.TempHeadroomC
+			thermalStatus := "✓ OK"
+			switch {
+			case headroom < 10:
+				thermalStatus = "⛔ CRITICAL"
+			case gpu.Steady.P95TempC >= slowdownTemp:
+				thermalStatus = "⚠ WARNING"
+			}
+			throttlePct := "-"
+			if gpu.Scores.ThermalThrottlePct > 0 {
+				throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
+				fmt.Sprintf("%.0f°C", slowdownTemp),
+				fmt.Sprintf("%.0f°C", shutdownTemp),
+				fmt.Sprintf("%.1f°C", headroom),
+				throttlePct,
+				thermalStatus,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 3: Power delivery
+	b.WriteString("### 3. Power Delivery\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			powerCap := "-"
+			if gpu.Scores.PowerCapThrottlePct > 0 {
+				powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
+			}
+			fanDuty := "-"
+			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
+				fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+			}
+			powerStatus := "✓ OK"
+			if gpu.Scores.PowerCapThrottlePct > 5 {
+				powerStatus = "⚠ POWER LIMITED"
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				powerCap,
+				fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
+				fanDuty,
+				powerStatus,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 4: Performance
+	b.WriteString("### 4. Performance\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			synthetic := "-"
+			if gpu.Scores.SyntheticScore > 0 {
+				synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
+			}
+			mixed := "-"
+			if gpu.Scores.MixedScore > 0 {
+				mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
+			}
+			mixedEff := "-"
+			if gpu.Scores.MixedEfficiency > 0 {
+				mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
+			}
+			topsPerSM := "-"
+			if gpu.Scores.TOPSPerSMPerGHz > 0 {
+				topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
+				synthetic, mixed, mixedEff, topsPerSM,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
+		if len(result.PerformanceRampSteps) > 0 {
+			fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
+		}
+		b.WriteString("\n")
+	}
+
+	// Perspective 5: Anomaly flags
+	b.WriteString("### 5. Anomalies\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			eccCorr := "-"
+			if gpu.ECC.Corrected > 0 {
+				eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
+			}
+			syncBoost := "-"
+			if gpu.Scores.SyncBoostThrottlePct > 0 {
+				syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
+			}
+			powerVar := "OK"
+			if gpu.Scores.PowerSustainScore < 70 {
+				powerVar = "⚠ unstable"
+			}
+			thermalVar := "OK"
+			if gpu.Scores.ThermalSustainScore < 70 {
+				thermalVar = "⚠ unstable"
+			}
+			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
+		b.WriteString("\n")
 	}
-	b.WriteString("\n")

 	// ── Per GPU detail ────────────────────────────────────────────────────────
 	b.WriteString("## Per-GPU Details\n\n")
@@ -171,13 +269,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
 		}
 		if gpu.PowerLimitDerated {
-			fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
+			fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
 		}
 		if gpu.CalibratedPeakPowerW > 0 {
 			if gpu.CalibratedPeakTempC > 0 {
-				fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
 			} else {
-				fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
 			}
 		}
 		if gpu.LockedGraphicsClockMHz > 0 {
@@ -186,19 +284,27 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")

 		// Steady-state telemetry
-		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
-		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
-		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
-		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
-		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
-		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
-		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
-		b.WriteString("\n")
+		if benchmarkTelemetryAvailable(gpu.Steady) {
+			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+			b.WriteString(fmtMDTable(
+				[]string{"", "Avg", "P95"},
+				[][]string{
+					{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
+					{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
+					{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
+					{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
+					{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
+				},
+			))
+			b.WriteString("\n")
+		} else {
+			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
+		}

 		// Per-precision stability phases.
 		if len(gpu.PrecisionSteady) > 0 {
 			b.WriteString("**Per-precision stability:**\n\n")
-			b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
+			var precRows [][]string
 			for _, p := range gpu.PrecisionSteady {
 				eccCorr := "—"
 				eccUncorr := "—"
@@ -210,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 				if strings.TrimSpace(status) == "" {
 					status = "OK"
 				}
-				fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
-					p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
-					eccCorr, eccUncorr)
+				precRows = append(precRows, []string{
+					p.Precision, status,
+					fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
+					fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
+					fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
+					eccCorr, eccUncorr,
+				})
 			}
+			b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
 			b.WriteString("\n")
 		} else {
 			// Legacy: show combined-window variance.
@@ -236,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		// Precision results
 		if len(gpu.PrecisionResults) > 0 {
 			b.WriteString("**Precision results:**\n\n")
-			b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
+			var presRows [][]string
 			for _, p := range gpu.PrecisionResults {
 				if p.Supported {
-					weightStr := fmt.Sprintf("×%.3g", p.Weight)
-					fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
-						p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
+					presRows = append(presRows, []string{
+						p.Name,
+						fmt.Sprintf("%.2f", p.TeraOpsPerSec),
+						fmt.Sprintf("×%.3g", p.Weight),
+						fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
+						fmt.Sprintf("%d", p.Lanes),
+						fmt.Sprintf("%d", p.Iterations),
+					})
 				} else {
-					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
+					presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
 				}
 			}
+			b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
 			b.WriteString("\n")
 		}

@@ -267,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("## Interconnect (NCCL)\n\n")
 		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
 		if result.Interconnect.Supported {
-			b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
-			fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
-			fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString(fmtMDTable(
+				[]string{"Metric", "Avg", "Max"},
+				[][]string{
+					{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
+					{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
+				},
+			))
 			b.WriteString("\n")
 		}
 		for _, note := range result.Interconnect.Notes {
@@ -280,20 +401,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

-	// ── Server Power (IPMI) ───────────────────────────────────────────────────
+	// ── Server Power ───────────────────────────────────────────────────────────
 	if sp := result.ServerPower; sp != nil {
-		b.WriteString("## Server Power (IPMI)\n\n")
+		title := "## Server Power\n\n"
+		if sp.Source != "" {
+			title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
+		}
+		b.WriteString(title)
 		if !sp.Available {
-			b.WriteString("IPMI power measurement unavailable.\n\n")
+			b.WriteString("Server power measurement unavailable.\n\n")
 		} else {
-			b.WriteString("| | Value |\n|---|---|\n")
-			fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
-			fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
-			fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
-			fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
-			if sp.ReportingRatio > 0 {
-				fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
+			spRows := [][]string{
+				{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
+				{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
+				{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
+				{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
 			}
+			if sp.ReportingRatio > 0 {
+				spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
+			}
+			b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
 			b.WriteString("\n")
 		}
 		for _, note := range sp.Notes {
@@ -304,19 +431,33 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

+	// ── PSU Issues ────────────────────────────────────────────────────────────
+	if len(result.PSUIssues) > 0 {
+		b.WriteString("## PSU Issues\n\n")
+		b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
+		for _, issue := range result.PSUIssues {
+			fmt.Fprintf(&b, "- ⛔ %s\n", issue)
+		}
+		b.WriteString("\n")
+	}
+
 	// ── Cooling ───────────────────────────────────────────────────────────────
 	if cooling := result.Cooling; cooling != nil {
 		b.WriteString("## Cooling\n\n")
 		if cooling.Available {
-			b.WriteString("| Metric | Value |\n|--------|-------|\n")
-			fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
+			dutyAvg, dutyP95 := "N/A", "N/A"
 			if cooling.FanDutyCycleAvailable {
-				fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
-				fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
-			} else {
-				b.WriteString("| Average fan duty cycle | N/A |\n")
-				b.WriteString("| P95 fan duty cycle | N/A |\n")
+				dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
+				dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
 			}
+			b.WriteString(fmtMDTable(
+				[]string{"Metric", "Value"},
+				[][]string{
+					{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
+					{"Average fan duty cycle", dutyAvg},
+					{"P95 fan duty cycle", dutyP95},
+				},
+			))
 			b.WriteString("\n")
 		} else {
 			b.WriteString("Cooling telemetry unavailable.\n\n")
@@ -329,6 +470,23 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

+	// ── Platform Scalability ──────────────────────────────────────────────────
+	if len(result.PerformanceRampSteps) > 0 {
+		b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n\n", result.PlatformPowerScore)
+		var scalRows [][]string
+		for _, step := range result.PerformanceRampSteps {
+			scalRows = append(scalRows, []string{
+				fmt.Sprintf("%d", step.StepIndex),
+				joinIndexList(step.GPUIndices),
+				fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
+				fmt.Sprintf("%.1f%%", step.ScalabilityPct),
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
+		b.WriteString("\n")
+	}
+
 	// ── Raw files ─────────────────────────────────────────────────────────────
 	b.WriteString("## Raw Files\n\n")
 	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
@@ -0,0 +1,75 @@
+package platform
+
+import (
+	"strings"
+	"unicode/utf8"
+)
+
+// fmtMDTable renders a markdown table whose columns are padded to a common
+// width, so the table is also readable as plain text without a markdown
+// renderer.
+//
+// headers contains the column header strings; the number of headers fixes the
+// number of columns. An empty headers slice yields an empty string.
+// rows contains the data rows. A row with fewer cells than headers is padded
+// with empty cells; cells beyond the last header are ignored.
+//
+// Widths are measured in runes rather than bytes, so cells containing
+// multi-byte characters such as "°", "✓" or "—" do not push the column
+// borders out of alignment. Characters that a terminal draws two columns wide
+// (some emoji) are still counted as one.
+func fmtMDTable(headers []string, rows [][]string) string {
+	ncols := len(headers)
+	if ncols == 0 {
+		return ""
+	}
+
+	// cellAt returns the i-th cell of row, or "" when the row is too short.
+	cellAt := func(row []string, i int) string {
+		if i < len(row) {
+			return row[i]
+		}
+		return ""
+	}
+
+	// Compute the max width per column over the header and every data row.
+	widths := make([]int, ncols)
+	for i, h := range headers {
+		widths[i] = utf8.RuneCountInString(h)
+	}
+	for _, row := range rows {
+		for i := range widths {
+			if w := utf8.RuneCountInString(cellAt(row, i)); w > widths[i] {
+				widths[i] = w
+			}
+		}
+	}
+
+	var b strings.Builder
+
+	// writeRow emits one "| cell | cell |" line padded to the column widths.
+	writeRow := func(row []string) {
+		b.WriteByte('|')
+		for i, width := range widths {
+			cell := cellAt(row, i)
+			b.WriteByte(' ')
+			b.WriteString(cell)
+			b.WriteString(strings.Repeat(" ", width-utf8.RuneCountInString(cell)))
+			b.WriteString(" |")
+		}
+		b.WriteByte('\n')
+	}
+
+	// Header row.
+	writeRow(headers)
+
+	// Separator row: dashes span the cell plus its two padding spaces.
+	b.WriteByte('|')
+	for _, width := range widths {
+		b.WriteString(strings.Repeat("-", width+2))
+		b.WriteByte('|')
+	}
+	b.WriteByte('\n')
+
+	// Data rows.
+	for _, row := range rows {
+		writeRow(row)
+	}
+
+	return b.String()
+}
@@ -1,8 +1,13 @@
 package platform

 import (
+	"context"
+	"fmt"
+	"os/exec"
+	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 )

 func TestResolveBenchmarkProfile(t *testing.T) {
@@ -49,8 +54,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 		benchmarkPrecisionPhases,
 		func(label string) string { return label },
 	)
-	if len(labels) != 7 || len(phases) != 7 {
-		t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
+	if len(labels) != 5 || len(phases) != 5 {
+		t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
 	}
 	if basePhaseSec != 60 {
 		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
@@ -61,7 +66,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
 		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
 	}
-	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
+	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -80,7 +85,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
 	if mixedPhaseSec != 3600 {
 		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
+	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -99,7 +104,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
 	if mixedPhaseSec != 14400 {
 		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
+	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -133,10 +138,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
 func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
 	t.Parallel()

-	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
+	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
-	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
+	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
 }
@@ -164,6 +169,99 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
 	}
 }

+// TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons checks that growth
+// in the power-related throttle counters yields an empty calibration throttle
+// reason, while growth in the thermal counters is reported as "hw_thermal" or
+// "sw_thermal".
+func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
+	t.Parallel()
+
+	baseline := BenchmarkThrottleCounters{}
+	checks := []struct {
+		after   BenchmarkThrottleCounters
+		want    string
+		failFmt string
+	}{
+		{BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}, "", "sw_power_cap should be ignored, got %q"},
+		{BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}, "", "hw_power_brake should be ignored, got %q"},
+		{BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}, "hw_thermal", "hw_thermal mismatch: got %q"},
+		{BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}, "sw_thermal", "sw_thermal mismatch: got %q"},
+	}
+	// Checks run in declaration order and stop at the first mismatch.
+	for _, check := range checks {
+		if got := benchmarkCalibrationThrottleReason(baseline, check.after); got != check.want {
+			t.Fatalf(check.failFmt, got)
+		}
+	}
+}
+
+// TestResetBenchmarkGPUsSkipsWithoutRoot stubs benchmarkGeteuid to return 1000
+// and expects resetBenchmarkGPUs to log that the reset was skipped, never call
+// the reset hook, and return every requested index as failed.
+func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
+	// Not parallel: the test swaps package-level hooks and restores them on cleanup.
+	prevGeteuid, prevReset := benchmarkGeteuid, benchmarkResetNvidiaGPU
+	t.Cleanup(func() {
+		benchmarkGeteuid = prevGeteuid
+		benchmarkResetNvidiaGPU = prevReset
+	})
+	benchmarkGeteuid = func() int { return 1000 }
+	benchmarkResetNvidiaGPU = func(int) (string, error) {
+		t.Fatal("unexpected reset call")
+		return "", nil
+	}
+
+	var logged []string
+	record := func(line string) { logged = append(logged, line) }
+	verbosePath := filepath.Join(t.TempDir(), "verbose.log")
+	failed := resetBenchmarkGPUs(context.Background(), verbosePath, []int{0, 2}, record)
+
+	const wantLog = "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"
+	if joined := strings.Join(logged, "\n"); !strings.Contains(joined, wantLog) {
+		t.Fatalf("logs=%q want substring %q", joined, wantLog)
+	}
+	if !(len(failed) == 2 && failed[0] == 0 && failed[1] == 2) {
+		t.Fatalf("failed=%v want [0 2]", failed)
+	}
+}
+
+// TestResetBenchmarkGPUsResetsEachGPU stubs benchmarkGeteuid to return 0 and
+// expects the reset hook to be called once per requested index, in request
+// order, with no index reported as failed.
+func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
+	// Not parallel: the test swaps package-level hooks and restores them on cleanup.
+	prevGeteuid, prevSleep, prevReset := benchmarkGeteuid, benchmarkSleep, benchmarkResetNvidiaGPU
+	t.Cleanup(func() {
+		benchmarkGeteuid = prevGeteuid
+		benchmarkSleep = prevSleep
+		benchmarkResetNvidiaGPU = prevReset
+	})
+	benchmarkGeteuid = func() int { return 0 }
+	// Sleep is replaced with a no-op so the test does not wait.
+	benchmarkSleep = func(time.Duration) {}
+	var resetOrder []int
+	benchmarkResetNvidiaGPU = func(index int) (string, error) {
+		resetOrder = append(resetOrder, index)
+		return "ok\n", nil
+	}
+
+	verbosePath := filepath.Join(t.TempDir(), "verbose.log")
+	failed := resetBenchmarkGPUs(context.Background(), verbosePath, []int{2, 5}, nil)
+	if len(failed) > 0 {
+		t.Fatalf("failed=%v want no failures", failed)
+	}
+	const wantCalls = "[2 5]"
+	if fmt.Sprint(resetOrder) != wantCalls {
+		t.Fatalf("calls=%v want %s", resetOrder, wantCalls)
+	}
+}
+
+// TestResetBenchmarkGPUsTracksFailuresFromSharedReset makes the reset hook fail
+// for index 5 only and expects resetBenchmarkGPUs to report exactly that index
+// as failed.
+func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
+	// Not parallel: the test swaps package-level hooks and restores them on cleanup.
+	prevGeteuid, prevSleep, prevReset := benchmarkGeteuid, benchmarkSleep, benchmarkResetNvidiaGPU
+	t.Cleanup(func() {
+		benchmarkGeteuid = prevGeteuid
+		benchmarkSleep = prevSleep
+		benchmarkResetNvidiaGPU = prevReset
+	})
+	benchmarkGeteuid = func() int { return 0 }
+	// Sleep is replaced with a no-op so the test does not wait.
+	benchmarkSleep = func(time.Duration) {}
+	benchmarkResetNvidiaGPU = func(index int) (string, error) {
+		if index != 5 {
+			return "ok\n", nil
+		}
+		return "busy\n", exec.ErrNotFound
+	}
+
+	verbosePath := filepath.Join(t.TempDir(), "verbose.log")
+	failed := resetBenchmarkGPUs(context.Background(), verbosePath, []int{2, 5}, nil)
+	const wantFailed = "[5]"
+	if got := fmt.Sprint(failed); got != wantFailed {
+		t.Fatalf("failed=%v want %s", failed, wantFailed)
+	}
+}
+
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()

@@ -179,6 +277,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	}
 }

+// TestInitialBenchmarkCalibrationLimitW pins the starting power limit chosen
+// for calibration across four fixtures: the default limit wins over the
+// current one, is capped by the reported maximum, and the current or maximum
+// limit is used when the preferred values are absent.
+func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
+	t.Parallel()
+
+	scenarios := []struct {
+		name string
+		info benchmarkGPUInfo
+		want int
+	}{
+		{
+			name: "prefers default tdp over current derated limit",
+			info: benchmarkGPUInfo{PowerLimitW: 500, DefaultPowerLimitW: 600, MaxPowerLimitW: 600},
+			want: 600,
+		},
+		{
+			name: "caps default tdp to reported max limit",
+			info: benchmarkGPUInfo{PowerLimitW: 500, DefaultPowerLimitW: 700, MaxPowerLimitW: 650},
+			want: 650,
+		},
+		{
+			name: "falls back to current limit when default missing",
+			info: benchmarkGPUInfo{PowerLimitW: 525, MaxPowerLimitW: 600},
+			want: 525,
+		},
+		{
+			name: "falls back to max limit when only that is known",
+			info: benchmarkGPUInfo{MaxPowerLimitW: 575},
+			want: 575,
+		},
+	}
+
+	for i := range scenarios {
+		// Subtests are not parallel, so each runs to completion before the
+		// loop advances; a per-iteration copy keeps the closure self-contained.
+		scenario := scenarios[i]
+		t.Run(scenario.name, func(t *testing.T) {
+			got := initialBenchmarkCalibrationLimitW(scenario.info)
+			if got != scenario.want {
+				t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", scenario.info, got, scenario.want)
+			}
+		})
+	}
+}
+
 func TestParseBenchmarkBurnLog(t *testing.T) {
 	t.Parallel()

@@ -314,12 +465,40 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
 	}
 }

-func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+// TestScoreBenchmarkGPUIgnoresDisabledPrecisions feeds scoreBenchmarkGPUResult
+// fp64 and fp4 entries carrying a large weighted value (999) alongside one fp16
+// steady phase (100) and one fp32_tf32 result (50), and expects the synthetic
+// and mixed scores to equal 100 and 50, i.e. the fp64/fp4 entries add nothing.
+func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
+	t.Parallel()
+
+	input := BenchmarkGPUResult{
+		PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
+			{Precision: "fp16", WeightedTeraOpsPerSec: 100},
+			{Precision: "fp64", WeightedTeraOpsPerSec: 999},
+			{Precision: "fp4", WeightedTeraOpsPerSec: 999},
+		},
+		PrecisionResults: []BenchmarkPrecisionResult{
+			{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
+			{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
+			{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
+		},
+	}
+	scores := scoreBenchmarkGPUResult(input)
+
+	if got := scores.SyntheticScore; got != 100 {
+		t.Fatalf("SyntheticScore=%f want 100", got)
+	}
+	if got := scores.MixedScore; got != 50 {
+		t.Fatalf("MixedScore=%f want 50", got)
+	}
+}
+
+func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Min Power Limit                       : 200.00 W
+    Max Power Limit                       : 600.00 W
+    Default Power Limit                   : 575.00 W
+    Current Power Limit                   : 560.00 W
    Clocks
        Graphics                          : 2422 MHz
        Memory                            : 12481 MHz
@@ -341,7 +520,7 @@ GPU 00000000:4F:00.0
 		1: {Index: 1, BusID: "00000000:4F:00.0"},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -355,25 +534,49 @@ GPU 00000000:4F:00.0
 	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
 		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
+	}
+	if infoByIndex[0].MaxPowerLimitW != 600 {
+		t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
+	}
+	if infoByIndex[0].DefaultPowerLimitW != 575 {
+		t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
+	}
+	if infoByIndex[0].PowerLimitW != 560 {
+		t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
+	}
 }

-func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
+    Min Power Limit                       : 100.00 W
+    Max Power Limit                       : 900.00 W
    Max Clocks
        Graphics                          : 9999 MHz
        Memory                            : 9999 MHz
 `)
 	// Already populated — must not be overwritten.
 	infoByIndex := map[int]benchmarkGPUInfo{
-		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+		0: {
+			Index:               0,
+			BusID:               "00000000:4E:00.0",
+			MaxGraphicsClockMHz: 2430,
+			MaxMemoryClockMHz:   12481,
+			MinPowerLimitW:      200,
+			MaxPowerLimitW:      600,
+		},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
+	}
 }
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
 	Available             bool     `json:"available"`
 	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
 	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
 	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
 	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
 	Notes                 []string `json:"notes,omitempty"`
@@ -42,40 +43,151 @@ const (
 	NvidiaBenchmarkProfileOvernight = "overnight"
 )

+// Power benchmark engine identifiers (string values as written to/read from
+// configuration or results).
+// NOTE(review): presumably these name the workload used to load the GPUs during
+// the power benchmark — confirm where the values are consumed.
+const (
+	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
+	BenchmarkPowerEngineTargetedPower  = "targeted_power"
+)
+
+// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
+// All values are in seconds.
+// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
+// re-measure from actual task logs and update the constants here.
+//
+// Sources:
+//   - BenchmarkEstimatedPerfStandardSec:   MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
+//   - BenchmarkEstimatedPerfStabilitySec:  xFusion v8.22 ramp 1-8: 5532 s
+//   - BenchmarkEstimatedPerfOvernightSec:  derived from profile phases (SteadySec=27000)
+//   - BenchmarkEstimatedPowerStandardSec:  MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
+//   - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
+//   - BenchmarkEstimatedPowerOvernightSec: no source recorded here (fixed at 3 h) — TODO(review): add a measurement
+const (
+	// Performance Benchmark (bee-gpu-burn).
+	// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
+	// Sequential per-GPU mode scales approximately linearly.
+	BenchmarkEstimatedPerfStandardSec  = 960  // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
+	BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
+	// 8 h (28800 s).
+	BenchmarkEstimatedPerfOvernightSec = 8 * 3600
+
+	// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
+	// Duration is for the full ramp-up run; individual steps vary with convergence speed.
+	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
+	BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
+	// 3 h (10800 s).
+	BenchmarkEstimatedPowerOvernightSec = 3 * 3600
+)
+
+// NvidiaBenchmarkOptions carries the caller-selected settings for an NVIDIA
+// benchmark run: profile, GPU selection, and ramp-up bookkeeping.
 type NvidiaBenchmarkOptions struct {
 	Profile           string
 	SizeMB            int
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	RunNCCL           bool
+	// ServerPowerSource names the server power telemetry source to use.
+	// NOTE(review): presumably one of the BenchmarkPowerSource* values; what an
+	// empty string means is not visible here — confirm against the consumer.
+	ServerPowerSource string
 	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
 	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
 	RampTotal         int    // total number of ramp-up steps in this run
 	RampRunID         string // shared identifier across all steps of the same ramp-up run
 }

+// Server power source identifiers.
+// NOTE(review): the names suggest "dcmi" is the IPMI DCMI power reading and
+// "sdr_psu_input" is the per-PSU SDR input sensors — confirm against the
+// sampling code.
+const (
+	BenchmarkPowerSourceDCMI        = "dcmi"
+	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
+)
+
+// BenchmarkPowerAutotuneConfig is the JSON form of a power autotune outcome:
+// which power source was selected, when the record was updated, and the
+// idle/load/sample-interval settings (in seconds) it was produced with.
+// Version, UpdatedAt and SelectedSource are always emitted; the remaining
+// fields are dropped from the JSON when zero (omitempty).
+// NOTE(review): presumably persisted and reloaded by later benchmark runs —
+// confirm against the code that reads/writes it.
+type BenchmarkPowerAutotuneConfig struct {
+	Version           int       `json:"version"`
+	UpdatedAt         time.Time `json:"updated_at"`
+	SelectedSource    string    `json:"selected_source"`
+	BenchmarkKind     string    `json:"benchmark_kind,omitempty"`
+	Profile           string    `json:"profile,omitempty"`
+	IdleDurationSec   int       `json:"idle_duration_sec,omitempty"`
+	LoadDurationSec   int       `json:"load_duration_sec,omitempty"`
+	SampleIntervalSec int       `json:"sample_interval_sec,omitempty"`
+	Confidence        float64   `json:"confidence,omitempty"`
+	Reason            string    `json:"reason,omitempty"`
+}
+
+// SystemPowerSourceDecision describes which system power source is in effect
+// and why. Mode is one of "autotuned", "fallback" or "degraded" (see the field
+// comment).
+// NOTE(review): SelectedSource and EffectiveSource presumably differ when the
+// selected source cannot be used — confirm against the producer.
+// NOTE: encoding/json's omitempty never treats a struct value as empty, so a
+// zero ConfiguredAt is still emitted ("0001-01-01T00:00:00Z") despite its tag.
+type SystemPowerSourceDecision struct {
+	Configured      bool      `json:"configured"`
+	SelectedSource  string    `json:"selected_source,omitempty"`
+	EffectiveSource string    `json:"effective_source,omitempty"`
+	Mode            string    `json:"mode,omitempty"` // autotuned, fallback, degraded
+	Reason          string    `json:"reason,omitempty"`
+	ConfiguredAt    time.Time `json:"configured_at,omitempty"`
+}
+
+// BenchmarkPowerAutotuneResult is the JSON report of one power autotune run:
+// host identity, the idle/load/sample-interval settings, the idle validation
+// outcome (or its error text), GPU power at idle and under load, the evaluated
+// source candidates, and the resulting config.
+// GeneratedAt, Status and the three *Sec fields carry no omitempty and are
+// always emitted; IdleValidation and Config are omitted when nil.
+type BenchmarkPowerAutotuneResult struct {
+	GeneratedAt         time.Time                         `json:"generated_at"`
+	Hostname            string                            `json:"hostname,omitempty"`
+	ServerModel         string                            `json:"server_model,omitempty"`
+	BenchmarkKind       string                            `json:"benchmark_kind,omitempty"`
+	Profile             string                            `json:"profile,omitempty"`
+	Status              string                            `json:"status"`
+	IdleDurationSec     int                               `json:"idle_duration_sec"`
+	LoadDurationSec     int                               `json:"load_duration_sec"`
+	SampleIntervalSec   int                               `json:"sample_interval_sec"`
+	SelectedSource      string                            `json:"selected_source,omitempty"`
+	IdleValidationError string                            `json:"idle_validation_error,omitempty"`
+	IdleValidation      *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
+	GPUPowerIdleW       float64                           `json:"gpu_power_idle_w,omitempty"`
+	GPUPowerLoadW       float64                           `json:"gpu_power_load_w,omitempty"`
+	Candidates          []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
+	Notes               []string                          `json:"notes,omitempty"`
+	Config              *BenchmarkPowerAutotuneConfig     `json:"config,omitempty"`
+}
+
+// BenchmarkPowerAutotuneValidation reports whether a measurement window was
+// accepted (Valid), together with the average/p95 GPU and CPU usage
+// percentages and the sample counts behind them; Reason holds explanatory text.
+// Valid is always emitted; all other fields are omitted when zero.
+// NOTE(review): the acceptance thresholds are not visible in this type —
+// see the autotune implementation.
+type BenchmarkPowerAutotuneValidation struct {
+	Valid          bool    `json:"valid"`
+	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
+	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
+	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
+	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
+	GPUSamples     int     `json:"gpu_samples,omitempty"`
+	CPUSamples     int     `json:"cpu_samples,omitempty"`
+	Reason         string  `json:"reason,omitempty"`
+}
+
+// BenchmarkPowerAutotuneCandidate holds the measurements for one candidate
+// power source considered during autotune: idle and load averages in watts,
+// their delta, the sample count, and the scoring fields (RelativeError,
+// Confidence). Selected marks the chosen candidate.
+// Source and Available are always emitted; every other field is dropped from
+// the JSON when zero (omitempty), so a RelativeError of exactly 0 is omitted.
+// NOTE(review): RelativeError is presumably measured against the GPU-reported
+// power delta — confirm in the autotune implementation.
+type BenchmarkPowerAutotuneCandidate struct {
+	Source         string  `json:"source"`
+	IdleAvgW       float64 `json:"idle_avg_w,omitempty"`
+	LoadAvgW       float64 `json:"load_avg_w,omitempty"`
+	DeltaW         float64 `json:"delta_w,omitempty"`
+	Samples        int     `json:"samples,omitempty"`
+	RelativeError  float64 `json:"relative_error,omitempty"`
+	Confidence     float64 `json:"confidence,omitempty"`
+	Selected       bool    `json:"selected,omitempty"`
+	Available      bool    `json:"available"`
+	SelectionNotes string  `json:"selection_notes,omitempty"`
+}
+
 type NvidiaBenchmarkResult struct {
-	BenchmarkVersion   string                       `json:"benchmark_version"`
-	GeneratedAt        time.Time                    `json:"generated_at"`
-	Hostname           string                       `json:"hostname,omitempty"`
-	ServerModel        string                       `json:"server_model,omitempty"`
-	BenchmarkProfile   string                       `json:"benchmark_profile"`
-	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
-	RampStep           int                          `json:"ramp_step,omitempty"`
-	RampTotal          int                          `json:"ramp_total,omitempty"`
-	RampRunID          string                       `json:"ramp_run_id,omitempty"`
-	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
-	OverallStatus      string                       `json:"overall_status"`
-	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
-	Findings           []string                     `json:"findings,omitempty"`
-	Warnings           []string                     `json:"warnings,omitempty"`
-	Normalization      BenchmarkNormalization       `json:"normalization"`
-	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
-	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
-	Cooling            *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
-	GPUs               []BenchmarkGPUResult         `json:"gpus"`
-	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
-	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
+	BenchmarkVersion string    `json:"benchmark_version"`
+	GeneratedAt      time.Time `json:"generated_at"`
+	Hostname         string    `json:"hostname,omitempty"`
+	ServerModel      string    `json:"server_model,omitempty"`
+	BenchmarkProfile string    `json:"benchmark_profile"`
+	ParallelGPUs     bool      `json:"parallel_gpus,omitempty"`
+	RampStep         int       `json:"ramp_step,omitempty"`
+	RampTotal        int       `json:"ramp_total,omitempty"`
+	RampRunID        string    `json:"ramp_run_id,omitempty"`
+	ScalabilityScore float64   `json:"scalability_score,omitempty"`
+	// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
+	// 100% = each added GPU contributes exactly its single-card throughput.
+	// < 100% = throughput loss due to thermal throttle, power limits, or contention.
+	PlatformPowerScore   float64                      `json:"platform_power_score,omitempty"`
+	PerformanceRampSteps []NvidiaPerformanceRampStep  `json:"performance_ramp_steps,omitempty"`
+	OverallStatus        string                       `json:"overall_status"`
+	SelectedGPUIndices   []int                        `json:"selected_gpu_indices"`
+	Findings             []string                     `json:"findings,omitempty"`
+	Warnings             []string                     `json:"warnings,omitempty"`
+	Normalization        BenchmarkNormalization       `json:"normalization"`
+	HostConfig           *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	CPULoad              *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
+	Cooling              *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
+	GPUs                 []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
+	// PSUIssues holds power supply fault events detected by comparing IPMI PSU
+	// sensor states before and after the benchmark run. Empty when IPMI is
+	// unavailable or no PSU faults occurred during the test.
+	PSUIssues []string `json:"psu_issues,omitempty"`
 }

 type BenchmarkNormalization struct {
@@ -107,6 +219,12 @@ type BenchmarkGPUResult struct {
 	PowerLimitDerated   bool    `json:"power_limit_derated,omitempty"`
 	MultiprocessorCount int     `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW  float64 `json:"default_power_limit_w,omitempty"`
+	// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
+	// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
+	ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
+	// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
+	// Fallback: 80°C.
+	SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
 	// CalibratedPeakPowerW is the p95 power measured during a short
 	// dcgmi targeted_power calibration run before the main benchmark.
 	// Used as the reference denominator for PowerSustainScore instead of
@@ -206,25 +324,87 @@ type BenchmarkScorecard struct {
 	MixedEfficiency     float64 `json:"mixed_efficiency,omitempty"`
 	PowerSustainScore   float64 `json:"power_sustain_score"`
 	ThermalSustainScore float64 `json:"thermal_sustain_score"`
-	StabilityScore      float64 `json:"stability_score"`
-	InterconnectScore   float64 `json:"interconnect_score"`
-	CompositeScore      float64 `json:"composite_score"`
+	// StabilityScore is derived from the fraction of steady-state time the GPU
+	// spent throttling (thermal + power cap combined): 0% throttle = 100; 100% throttle = 0.
+	StabilityScore float64 `json:"stability_score"`
+
+	// Throttle breakdown — percentage of steady-state time in each throttle type.
+	// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
+	ThermalThrottlePct   float64 `json:"thermal_throttle_pct"`   // HW+SW thermal slowdown
+	PowerCapThrottlePct  float64 `json:"power_cap_throttle_pct"` // SW power cap
+	SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
+
+	// Temperature headroom: distance to the 100°C destruction threshold.
+	// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
+	// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
+	TempHeadroomC float64 `json:"temp_headroom_c"`
+
+	InterconnectScore float64 `json:"interconnect_score"`
+	// ServerQualityScore (0–100) reflects server infrastructure quality independent
+	// of GPU model. Combines throttle time, power variance, and temp variance.
+	// Use this to compare servers with the same GPU, or to flag a bad server
+	// that throttles an otherwise fast GPU.
+	ServerQualityScore float64 `json:"server_quality_score"`
+	// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
+	// A throttling GPU will score lower here automatically — no quality multiplier.
+	CompositeScore float64 `json:"composite_score"`
 	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }

-// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
-// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
-// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
-// over-reporting its power consumption.
+// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
+// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
+// so benchmark and audit data can be correlated by slot.
+// InputW and OutputW are pointers: with omitempty only a nil pointer is
+// dropped, so a reading of exactly 0 W is still encoded.
+type BenchmarkPSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`  // AC wall input (PSUx_POWER_IN)
+	OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
+	Status  string   `json:"status,omitempty"`
+}
+
+// BenchmarkServerPower captures server-side power from multiple independent
+// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
+// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
+// covers only a subset of installed PSUs (partial coverage).
+//
+// Source legend:
+//   - DCMI      — `ipmitool dcmi power reading`; fast but may miss PSUs
+//   - SDR       — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
+//   - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
 type BenchmarkServerPower struct {
-	Available       bool     `json:"available"`
-	IdleW           float64  `json:"idle_w,omitempty"`
-	LoadedW         float64  `json:"loaded_w,omitempty"`
-	DeltaW          float64  `json:"delta_w,omitempty"`
-	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
-	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
-	Notes           []string `json:"notes,omitempty"`
+	Available         bool    `json:"available"`
+	Source            string  `json:"source,omitempty"`
+	Mode              string  `json:"mode,omitempty"`
+	Reason            string  `json:"reason,omitempty"`
+	SampleIntervalSec int     `json:"sample_interval_sec,omitempty"`
+	IdleW             float64 `json:"idle_w,omitempty"`   // DCMI at idle
+	LoadedW           float64 `json:"loaded_w,omitempty"` // DCMI at peak load
+	DeltaW            float64 `json:"delta_w,omitempty"`  // DCMI loaded − idle
+	GPUReportedSumW   float64 `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio    float64 `json:"reporting_ratio,omitempty"`
+
+	// PSU AC input sum — sampled at idle and at peak load using collector's
+	// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
+	PSUInputIdleW   float64 `json:"psu_input_idle_w,omitempty"`
+	PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
+
+	// PSU DC output sum — power delivered to server internals after conversion.
+	PSUOutputIdleW   float64 `json:"psu_output_idle_w,omitempty"`
+	PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
+
+	// Per-slot PSU readings at idle and at peak load.
+	// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
+	PSUSlotReadingsIdle   map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
+	PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
+
+	// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
+	// PCIe slot delivery only (excludes 16-pin connector power).
+	GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
+
+	// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
+	// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
+	DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
+
+	Notes []string `json:"notes,omitempty"`
 }

 // BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
@@ -265,36 +445,92 @@ type NvidiaPowerBenchResult struct {
 	RecommendedSlotOrder []int                  `json:"recommended_slot_order,omitempty"`
 	RampSteps            []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
 	OverallStatus        string                 `json:"overall_status"`
-	Findings             []string               `json:"findings,omitempty"`
-	GPUs                 []NvidiaPowerBenchGPU  `json:"gpus"`
+	// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
+	// cumulative thermal ramp. Represents the actual sustained power budget of
+	// this server under full GPU load. Use for rack power planning.
+	PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
+	// ServerPower captures IPMI server power delta (idle→loaded) measured in
+	// parallel with the thermal ramp. Use to compare GPU-reported TDP against
+	// actual wall-power draw as seen by the server's power supply.
+	ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
+	Findings    []string              `json:"findings,omitempty"`
+	GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
+	// PSUIssues holds power supply fault events detected by comparing IPMI PSU
+	// sensor states before and after the power benchmark run. Empty when IPMI is
+	// unavailable or no PSU faults occurred during the test.
+	PSUIssues []string `json:"psu_issues,omitempty"`
 }

 type NvidiaPowerBenchGPU struct {
-	Index               int      `json:"index"`
-	Name                string   `json:"name,omitempty"`
-	BusID               string   `json:"bus_id,omitempty"`
-	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
-	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
+	Index              int     `json:"index"`
+	Name               string  `json:"name,omitempty"`
+	BusID              string  `json:"bus_id,omitempty"`
+	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
+	// AppliedPowerLimitW is the stable limit found during single-card calibration.
+	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
+	// StablePowerLimitW is the final fixed limit for this GPU after the
+	// cumulative thermal ramp. This is the limit at which the GPU operated
+	// stably with all other GPUs running simultaneously at their own limits.
+	// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
+	// additional derating.
+	StablePowerLimitW   float64  `json:"stable_power_limit_w,omitempty"`
 	MaxObservedPowerW   float64  `json:"max_observed_power_w,omitempty"`
 	MaxObservedTempC    float64  `json:"max_observed_temp_c,omitempty"`
 	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
 	Derated             bool     `json:"derated,omitempty"`
 	Status              string   `json:"status"`
-	OccupiedSlots       []int    `json:"occupied_slots,omitempty"`
-	OccupiedSlotsNote   string   `json:"occupied_slots_note,omitempty"`
 	Notes               []string `json:"notes,omitempty"`
 	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
 	CoolingWarning string `json:"cooling_warning,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// Telemetry holds the aggregated stats from the final converged calibration
+	// attempt for this GPU (temperature, power, fan, clock percentiles).
+	Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
+	// Fan state sampled at the end of single-card calibration.
+	AvgFanRPM          float64 `json:"avg_fan_rpm,omitempty"`
+	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
 }

 type NvidiaPowerBenchStep struct {
-	StepIndex              int      `json:"step_index"`
-	GPUIndices             []int    `json:"gpu_indices"`
-	TotalObservedPowerW    float64  `json:"total_observed_power_w,omitempty"`
-	AvgObservedPowerW      float64  `json:"avg_observed_power_w,omitempty"`
-	MinPowerRealizationPct float64  `json:"min_power_realization_pct,omitempty"`
-	AvgPowerRealizationPct float64  `json:"avg_power_realization_pct,omitempty"`
-	DeratedGPUCount        int      `json:"derated_gpu_count,omitempty"`
-	Status                 string   `json:"status"`
-	Notes                  []string `json:"notes,omitempty"`
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// NewGPUIndex is the GPU whose stable limit was searched in this step.
+	NewGPUIndex int `json:"new_gpu_index"`
+	// NewGPUStableLimitW is the stable power limit found for the new GPU.
+	NewGPUStableLimitW  float64  `json:"new_gpu_stable_limit_w,omitempty"`
+	TotalObservedPowerW float64  `json:"total_observed_power_w,omitempty"`
+	AvgObservedPowerW   float64  `json:"avg_observed_power_w,omitempty"`
+	Derated             bool     `json:"derated,omitempty"`
+	Status              string   `json:"status"`
+	Notes               []string `json:"notes,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// PSU slot readings sampled at end of this ramp step.
+	PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
+	// Fan state at end of this ramp step.
+	AvgFanRPM          float64 `json:"avg_fan_rpm,omitempty"`
+	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
+	// Per-GPU telemetry from this step's calibration, keyed by GPU index.
+	PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
+}
+
+// NvidiaPerformanceRampStep holds per-step performance data for the
+// scalability ramp-up phase of the performance benchmark.
+// Only TotalMixedTOPS and Notes are omitempty; every other field is always
+// present in the JSON, even when zero or empty.
+type NvidiaPerformanceRampStep struct {
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
+	// TOPS from dedicated single-precision phases) across all GPUs in this step.
+	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
+	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
+	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
+	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
+	// NOTE(review): k is presumably the number of GPUs in this step
+	// (len(GPUIndices)) — confirm against the code that fills this field.
+	ScalabilityPct float64  `json:"scalability_pct"`
+	Status         string   `json:"status"`
+	Notes          []string `json:"notes,omitempty"`
+}
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
 	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
 	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
 	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
+	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
 	for _, r := range rows {
 		dutyAvail := 0
 		if r.FanDutyCycleAvailable {
 			dutyAvail = 1
 		}
-		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
-			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
+		dutyEstimated := 0
+		if r.FanDutyCycleEstimated {
+			dutyEstimated = 1
+		}
+		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
+			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
@@ -12,6 +12,7 @@ import (
 )

 const installToRAMDir = "/dev/shm/bee-live"
+// copyProgressLogStep is the byte interval (100 MiB) used to space out copy
+// progress log lines.
+const copyProgressLogStep int64 = 100 * 1024 * 1024

 func (s *System) IsLiveMediaInRAM() bool {
 	return s.LiveMediaRAMState().InRAM
@@ -140,26 +141,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 	}

 	squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
-	if err != nil || len(squashfsFiles) == 0 {
-		return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
-	}
-
-	free := freeMemBytes()
-	var needed int64
-	for _, sf := range squashfsFiles {
-		fi, err2 := os.Stat(sf)
-		if err2 != nil {
-			return fmt.Errorf("stat %s: %v", sf, err2)
-		}
-		needed += fi.Size()
-	}
-	const headroom = 256 * 1024 * 1024
-	if free > 0 && needed+headroom > free {
-		return fmt.Errorf("insufficient RAM: need %s, available %s",
-			humanBytes(needed+headroom), humanBytes(free))
-	}
+	sourceAvailable := err == nil && len(squashfsFiles) > 0

 	dstDir := installToRAMDir
+
+	// If the source medium is unavailable, check whether a previous run already
+	// produced a complete copy in RAM. If so, skip the copy phase and proceed
+	// directly to the loop-rebind / bind-mount steps.
+	if !sourceAvailable {
+		copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
+		if len(copiedFiles) > 0 {
+			log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
+			// Proceed to rebind with the already-copied files.
+			for _, dst := range copiedFiles {
+				base := filepath.Base(dst)
+				// Re-associate the loop device that was originally backed by the
+				// source file (now gone); find it by the old source path pattern.
+				srcGuess := "/run/live/medium/live/" + base
+				loopDev, lerr := findLoopForFile(srcGuess)
+				if lerr != nil {
+					log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
+					continue
+				}
+				if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
+					log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
+				} else {
+					log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
+				}
+			}
+			goto bindMedium
+		}
+		return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
+	}
+
+	{
+		free := freeMemBytes()
+		var needed int64
+		for _, sf := range squashfsFiles {
+			fi, err2 := os.Stat(sf)
+			if err2 != nil {
+				return fmt.Errorf("stat %s: %v", sf, err2)
+			}
+			needed += fi.Size()
+		}
+		const headroom = 256 * 1024 * 1024
+		if free > 0 && needed+headroom > free {
+			return fmt.Errorf("insufficient RAM: need %s, available %s",
+				humanBytes(needed+headroom), humanBytes(free))
+		}
+	}
+
 	if state.CopyPresent {
 		log("Removing stale partial RAM copy before retry...")
 	}
@@ -199,6 +230,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 		}
 	}

+bindMedium:
 	log("Copying remaining medium files...")
 	if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
 		log(fmt.Sprintf("Warning: partial copy: %v", err))
@@ -288,6 +320,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 	defer out.Close()
 	total := fi.Size()
 	var copied int64
+	var lastLogged int64
 	buf := make([]byte, 4*1024*1024)
 	for {
 		if err := ctx.Err(); err != nil {
@@ -299,7 +332,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 				return werr
 			}
 			copied += int64(n)
-			if logFunc != nil && total > 0 {
+			if shouldLogCopyProgress(copied, total, lastLogged) {
+				lastLogged = copied
 				pct := int(float64(copied) / float64(total) * 100)
 				logFunc(fmt.Sprintf("  %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
 			}
@@ -314,6 +348,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 	return out.Sync()
 }

+// shouldLogCopyProgress decides whether a progress line is due after copied
+// bytes out of total have been written, given that the previous line was
+// emitted at lastLogged bytes.
+//
+// It returns false while nothing has been copied or the total is unknown
+// (<= 0). Once the copy is complete it returns true unless that exact byte
+// count was already logged. In between, it returns true only when at least
+// copyProgressLogStep bytes have been copied in total and since lastLogged.
+func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
+	switch {
+	case total <= 0, copied <= 0:
+		// Unknown size or nothing written yet: nothing worth reporting.
+		return false
+	case copied >= total:
+		// Completion is reported unless it has been reported already.
+		return lastLogged < copied
+	case copied < copyProgressLogStep:
+		// Stay quiet until the first full step has been copied.
+		return false
+	default:
+		return copied-lastLogged >= copyProgressLogStep
+	}
+}
+
 func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
 	return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
 		if ctx.Err() != nil {
@@ -101,3 +101,26 @@ func TestEvaluateLiveMediaRAMState(t *testing.T) {
 		}
 	})
 }
+
+// TestShouldLogCopyProgress walks a 250 MiB copy through the 100 MiB logging
+// boundaries and the final completion, failing on the first wrong decision.
+func TestShouldLogCopyProgress(t *testing.T) {
+	t.Parallel()
+
+	const mib = int64(1024 * 1024)
+	total, step := 250*mib, 100*mib
+
+	checks := []struct {
+		copied, lastLogged int64
+		want               bool
+		failMsg            string
+	}{
+		{step - 1, 0, false, "progress logged too early"},
+		{step, 0, true, "expected log at first 100MB boundary"},
+		{step + 16*mib, step, false, "progress logged again before next 100MB"},
+		{2 * step, step, true, "expected log at second 100MB boundary"},
+		{total, 2 * step, true, "expected final completion log"},
+	}
+	for _, c := range checks {
+		if got := shouldLogCopyProgress(c.copied, total, c.lastLogged); got != c.want {
+			t.Fatal(c.failMsg)
+		}
+	}
+}
@@ -1,11 +1,14 @@
 package platform

 import (
+	"context"
 	"fmt"
+	"log/slog"
 	"os"
 	"strconv"
 	"strings"
 	"syscall"
+	"time"
 )

 // workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
@@ -15,6 +18,7 @@ var workerPatterns = []string{
 	"stress-ng",
 	"stressapptest",
 	"memtester",
+	"nvbandwidth",
 	// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
 	// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
 	"nvvs",
@@ -30,7 +34,12 @@ type KilledProcess struct {
 // KillTestWorkers scans /proc for running test worker processes and sends
 // SIGKILL to each one found. It returns a list of killed processes.
 // Errors for individual processes (e.g. already exited) are silently ignored.
+// The scan runs under a 5-second deadline to avoid blocking if the process
+// table is very large (e.g. after a stress test with thousands of children).
 func KillTestWorkers() []KilledProcess {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
 	entries, err := os.ReadDir("/proc")
 	if err != nil {
 		return nil
@@ -38,6 +47,13 @@ func KillTestWorkers() []KilledProcess {

 	var killed []KilledProcess
 	for _, e := range entries {
+		select {
+		case <-ctx.Done():
+			slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
+			return killed
+		default:
+		}
+
 		if !e.IsDir() {
 			continue
 		}
@@ -56,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
 		if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
 			base = exe[idx+1:]
 		}
-		for _, pat := range workerPatterns {
-			if strings.Contains(base, pat) || strings.Contains(exe, pat) {
-				_ = syscall.Kill(pid, syscall.SIGKILL)
-				killed = append(killed, KilledProcess{PID: pid, Name: base})
-				break
-			}
+		if shouldKillWorkerProcess(exe, base) {
+			_ = syscall.Kill(pid, syscall.SIGKILL)
+			killed = append(killed, KilledProcess{PID: pid, Name: base})
 		}
 	}
 	return killed
 }
+
+// shouldKillWorkerProcess reports whether a process should be treated as a
+// test worker: true as soon as any entry of workerPatterns occurs as a
+// substring of either the executable's base name or its full path.
+func shouldKillWorkerProcess(exe, base string) bool {
+	for i := range workerPatterns {
+		needle := workerPatterns[i]
+		if !strings.Contains(base, needle) && !strings.Contains(exe, needle) {
+			continue
+		}
+		return true
+	}
+	return false
+}
@@ -0,0 +1,39 @@
+package platform
+
+import "testing"
+
+// TestShouldKillWorkerProcess checks the substring matcher against one path
+// per expected outcome. The two positive cases presumably rely on
+// workerPatterns containing "nvbandwidth" and "dcgmi" — verify against the
+// pattern list if either starts failing.
+func TestShouldKillWorkerProcess(t *testing.T) {
+	type testCase struct {
+		name, exe, base string
+		want            bool
+	}
+	cases := []testCase{
+		{"nvbandwidth executable", "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth", "nvbandwidth", true},
+		{"dcgmi executable", "/usr/bin/dcgmi", "dcgmi", true},
+		{"unrelated process", "/usr/bin/bash", "bash", false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := shouldKillWorkerProcess(tc.exe, tc.base)
+			if got == tc.want {
+				return
+			}
+			t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tc.exe, tc.base, got, tc.want)
+		})
+	}
+}
@@ -1,8 +1,10 @@
 package platform

 import (
+	"bee/audit/internal/collector"
 	"bufio"
 	"encoding/json"
+	"fmt"
 	"os"
 	"os/exec"
 	"sort"
@@ -14,13 +16,24 @@ import (
 // LiveMetricSample is a single point-in-time snapshot of server metrics
 // collected for the web UI metrics page.
 type LiveMetricSample struct {
-	Timestamp  time.Time      `json:"ts"`
-	Fans       []FanReading   `json:"fans"`
-	Temps      []TempReading  `json:"temps"`
-	PowerW     float64        `json:"power_w"`
-	CPULoadPct float64        `json:"cpu_load_pct"`
-	MemLoadPct float64        `json:"mem_load_pct"`
-	GPUs       []GPUMetricRow `json:"gpus"`
+	Timestamp   time.Time      `json:"ts"`
+	Fans        []FanReading   `json:"fans"`
+	Temps       []TempReading  `json:"temps"`
+	PowerW      float64        `json:"power_w"`
+	PowerSource string         `json:"power_source,omitempty"`
+	PowerMode   string         `json:"power_mode,omitempty"`
+	PowerReason string         `json:"power_reason,omitempty"`
+	PSUs        []PSUReading   `json:"psus,omitempty"`
+	CPULoadPct  float64        `json:"cpu_load_pct"`
+	MemLoadPct  float64        `json:"mem_load_pct"`
+	GPUs        []GPUMetricRow `json:"gpus"`
+}
+
+// PSUReading is a per-slot power supply input power reading.
+type PSUReading struct {
+	Slot   int     `json:"slot"`
+	Name   string  `json:"name"`
+	PowerW float64 `json:"power_w"`
 }

 // TempReading is a named temperature sensor value.
@@ -54,8 +67,17 @@ func SampleLiveMetrics() LiveMetricSample {
 		}
 	}

-	// System power — returns 0 if unavailable
-	s.PowerW = sampleSystemPower()
+	// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
+	s.PSUs = samplePSUPower()
+
+	// System power: use the global autotune-selected source when configured,
+	// otherwise fall back to the historical heuristic and mark the mode.
+	if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
+		s.PowerW = powerW
+		s.PowerSource = decision.EffectiveSource
+		s.PowerMode = decision.Mode
+		s.PowerReason = decision.Reason
+	}

 	// CPU load — from /proc/stat
 	s.CPULoadPct = sampleCPULoadPct()
@@ -326,3 +348,46 @@ func compactAmbientTempName(chip, name string) string {
 	}
 	return chip + " / " + name
 }
+
+// samplePSUPower returns one PSUReading per power supply that currently
+// reports a positive wattage, ordered by ascending slot number.
+//
+// It runs "ipmitool sdr" and hands the text to collector.PSUSlotsFromSDR,
+// which matches PSU sensors by name rather than by entity ID. The result is
+// nil when ipmitool fails or prints nothing, when no PSU slots are parsed, or
+// when no slot has a positive input/output power value.
+func samplePSUPower() []PSUReading {
+	raw, err := exec.Command("ipmitool", "sdr").Output()
+	if err != nil || len(raw) == 0 {
+		return nil
+	}
+	bySlot := collector.PSUSlotsFromSDR(string(raw))
+	if len(bySlot) == 0 {
+		return nil
+	}
+
+	// Map iteration order is random; gather the numeric keys and sort them so
+	// the output order is stable. Non-numeric keys are ignored.
+	var indices []int
+	for key := range bySlot {
+		if idx, convErr := strconv.Atoi(key); convErr == nil {
+			indices = append(indices, idx)
+		}
+	}
+	sort.Ints(indices)
+
+	var readings []PSUReading
+	for _, idx := range indices {
+		slot := bySlot[strconv.Itoa(idx)]
+		// AC input power wins; DC output power is only the fallback.
+		var watts float64
+		switch {
+		case slot.InputW != nil && *slot.InputW > 0:
+			watts = *slot.InputW
+		case slot.OutputW != nil && *slot.OutputW > 0:
+			watts = *slot.OutputW
+		}
+		if watts <= 0 {
+			continue
+		}
+		// The slot key is shifted by one for both the reported slot number
+		// and the display name.
+		readings = append(readings, PSUReading{
+			Slot:   idx + 1,
+			Name:   fmt.Sprintf("PSU%d", idx+1),
+			PowerW: watts,
+		})
+	}
+	// readings is still nil when nothing qualified.
+	return readings
+}
@@ -0,0 +1,51 @@
+package platform
+
+import (
+	"fmt"
+	"os/exec"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// nvidiaRecoverHelper is the absolute path of the helper that
+// runNvidiaRecover invokes via sudo.
+const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
+
+// runNvidiaRecover runs the recovery helper with args under sudo and returns
+// its combined stdout/stderr together with the command error.
+//
+// When systemd-run is on PATH the helper is wrapped in a uniquely named
+// transient oneshot unit (--pipe --wait --collect); otherwise it is executed
+// directly.
+func runNvidiaRecover(args ...string) (string, error) {
+	var sudoArgs []string
+	if _, lookErr := exec.LookPath("systemd-run"); lookErr == nil {
+		// The nanosecond timestamp keeps unit names from colliding.
+		unitName := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
+		sudoArgs = []string{
+			"systemd-run",
+			"--quiet",
+			"--pipe",
+			"--wait",
+			"--collect",
+			"--service-type=oneshot",
+			"--unit", unitName,
+		}
+	}
+	sudoArgs = append(sudoArgs, nvidiaRecoverHelper)
+	sudoArgs = append(sudoArgs, args...)
+
+	combined, err := exec.Command("sudo", sudoArgs...).CombinedOutput()
+	return string(combined), err
+}
+
+// resetNvidiaGPU asks the recovery helper to reset the GPU with the given
+// index. A negative index is rejected without running anything. If the helper
+// succeeds but prints only whitespace, a default confirmation line is
+// returned instead.
+func resetNvidiaGPU(index int) (string, error) {
+	if index < 0 {
+		return "", fmt.Errorf("gpu index must be >= 0")
+	}
+	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
+	if err != nil || strings.TrimSpace(out) != "" {
+		return out, err
+	}
+	return "GPU reset completed.\n", nil
+}
+
+// restartNvidiaDrivers asks the recovery helper to restart the NVIDIA
+// drivers. If the helper succeeds but prints only whitespace, a default
+// confirmation line is returned instead.
+func restartNvidiaDrivers() (string, error) {
+	out, err := runNvidiaRecover("restart-drivers")
+	if err != nil || strings.TrimSpace(out) != "" {
+		return out, err
+	}
+	return "NVIDIA drivers restarted.\n", nil
+}
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
 	"bee-audit",
 	"bee-web",
 	"bee-sshsetup",
+	"nvidia-dcgm",
+	"nvidia-fabricmanager",
 }

 func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
@@ -20,6 +20,54 @@ import (
 	"time"
 )

+// Estimated wall-clock durations for each SAT/validate test, derived from real
+// production logs in _benchmark/_v8/.
+//
+// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
+// the corresponding Run*Pack function change, re-measure the wall-clock duration
+// from actual task logs and update the matching constant here.
+//
+// Sources:
+//   - SATEstimatedCPUValidateSec:                 xFusion v8.6 — 62 s
+//   - SATEstimatedMemoryValidateSec:               xFusion v8.6 — 68 s
+//   - SATEstimatedNvidiaGPUValidateSec:            xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaGPUStressSec:              xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaTargetedStressSec:         xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaTargetedPowerSec:          MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaPulseTestSec:              xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
+//   - SATEstimatedNvidiaInterconnectSec:           xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
+//   - SATEstimatedNvidiaBandwidthSec:              xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
+const (
+	// All values in this block are whole seconds (see the "Sec" suffix).
+
+	// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
+	SATEstimatedCPUValidateSec = 65
+	// CPU stress: stress-ng 1800 s (stress mode default).
+	SATEstimatedCPUStressSec = 1800
+
+	// RAM: memtester 256 MB / 1 pass.
+	SATEstimatedMemoryValidateSec = 70
+	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
+	SATEstimatedMemoryStressSec = 140
+
+	// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUValidateSec = 85
+	// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUStressSec = 450
+
+	// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedStressSec = 350
+	// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedPowerSec = 350
+
+	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
+	SATEstimatedNvidiaPulseTestSec = 5000
+
+	// NCCL all_reduce_perf, all GPUs simultaneously.
+	SATEstimatedNvidiaInterconnectSec = 300
+	// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
+	// without a user-configurable time limit; duration is determined by nvbandwidth itself.
+	SATEstimatedNvidiaBandwidthSec = 2700
+)
+
 var (
 	satExecCommand  = exec.Command
 	satLookPath     = exec.LookPath
@@ -356,22 +404,17 @@ func normalizeNvidiaBusID(v string) string {
 }

 func (s *System) ResetNvidiaGPU(index int) (string, error) {
-	if index < 0 {
-		return "", fmt.Errorf("gpu index must be >= 0")
-	}
-	raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
-	if len(raw) == 0 && err == nil {
-		raw = []byte("GPU reset completed.\n")
-	}
-	return string(raw), err
+	return resetNvidiaGPU(index)
 }

-// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-	// detect GPU count
-	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
-	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	gpuCount := len(selected)
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
@@ -380,7 +423,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
-		}},
+		}, env: nvidiaVisibleDevicesEnv(selected)},
 	), logFunc)
 }

@@ -393,11 +436,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
 		profCmd []string
 		profEnv []string
 	)
-	if staggerSec > 0 && len(selected) > 1 {
+	if len(selected) > 1 {
+		// For multiple GPUs, always spawn one dcgmproftester process per GPU via
+		// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
+		// A single dcgmproftester process without -i only loads GPU 0 regardless
+		// of CUDA_VISIBLE_DEVICES.
+		stagger := staggerSec
+		if stagger < 0 {
+			stagger = 0
+		}
 		profCmd = []string{
 			"bee-dcgmproftester-staggered",
 			"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
-			"--stagger-seconds", strconv.Itoa(staggerSec),
+			"--stagger-seconds", strconv.Itoa(stagger),
 			"--devices", joinIndexList(selected),
 		}
 	} else {
@@ -426,6 +477,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -443,6 +501,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -460,6 +525,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -552,9 +624,19 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
 	if passes <= 0 {
 		passes = 1
 	}
+	// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
+	// intentionally conservative enough for healthy systems while avoiding the
+	// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
+	timeoutSec := sizeMB*passes*20/100 + 60
+	if timeoutSec < 180 {
+		timeoutSec = 180
+	}
+	if timeoutSec > 900 {
+		timeoutSec = 900
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
-		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
+		{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
 		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
 	}, logFunc)
 }
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"math"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -42,27 +43,56 @@ type GPUStressMetric struct {

 // FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
 type FanStressRow struct {
-	TimestampUTC string
-	ElapsedSec   float64
-	Phase        string // "baseline", "load1", "pause", "load2", "cooldown"
-	GPUs         []GPUStressMetric
-	Fans         []FanReading
-	CPUMaxTempC  float64 // highest CPU temperature from ipmitool / sensors
-	SysPowerW    float64 // DCMI system power reading
+	TimestampUTC   string
+	ElapsedSec     float64
+	Phase          string // "baseline", "load1", "pause", "load2", "cooldown"
+	GPUs           []GPUStressMetric
+	Fans           []FanReading
+	CPUMaxTempC    float64 // highest CPU temperature from ipmitool / sensors
+	SysPowerW      float64
+	SysPowerSource string
+	SysPowerMode   string
 }

 type cachedPowerReading struct {
 	Value     float64
+	Source    string
+	Mode      string
+	Reason    string
 	UpdatedAt time.Time
 }

+// fanObservationState is the persisted record of the highest RPM accepted for
+// each fan. It is serialized as JSON to fanObservationStatePath.
+type fanObservationState struct {
+	// MaxRPM maps a whitespace-trimmed fan name to its accepted maximum RPM.
+	MaxRPM map[string]float64 `json:"max_rpm"`
+}
+
+// fanPeakCandidate is a pending reading above a fan's recorded maximum that
+// has not yet been accepted as the new maximum.
+type fanPeakCandidate struct {
+	// FirstSeen is when the fan was first observed above its recorded maximum.
+	FirstSeen time.Time
+	// RPM is the highest reading seen since FirstSeen.
+	RPM float64
+}
+
 var (
 	systemPowerCacheMu sync.Mutex
 	systemPowerCache   cachedPowerReading
+	fanObservationMu   sync.Mutex
+	fanObservation     fanObservationState
+	fanObservationInit bool
+	fanPeakCandidates  = make(map[string]fanPeakCandidate)
 )

 const systemPowerHoldTTL = 15 * time.Second

+// fanObservationStatePath is the JSON file used to persist the observed
+// per-fan maximum RPM values. It is a variable rather than a constant so that
+// tests can point it at a temporary directory.
+var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
+
+// fanObservationMinPeakHold is the minimum age a pending above-maximum
+// reading must reach before updateFanObservation accepts it as a new maximum.
+const fanObservationMinPeakHold = time.Second
+
+// normalizeObservedFanMaxRPM rounds a positive RPM value up to the next
+// multiple of 1000 (exact multiples are unchanged). Zero and negative inputs
+// yield 0.
+func normalizeObservedFanMaxRPM(rpm float64) float64 {
+	switch {
+	case rpm <= 0:
+		return 0
+	default:
+		thousands := math.Ceil(rpm / 1000.0)
+		return thousands * 1000.0
+	}
+}
+
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -253,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
 	row.GPUs = sampleGPUStressMetrics(gpuIndices)
 	row.Fans, _ = sampleFanSpeeds()
 	row.CPUMaxTempC = sampleCPUMaxTemp()
-	row.SysPowerW = sampleSystemPower()
+	row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
 	return row
 }

@@ -310,11 +340,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
 	if err == nil {
 		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
+			updateFanObservation(fans, time.Now())
 			return fans, nil
 		}
 	}
 	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
 	if len(fans) > 0 {
+		updateFanObservation(fans, time.Now())
 		return fans, nil
 	}
 	if err != nil {
@@ -323,6 +355,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	return nil, sensorsErr
 }

+// loadFanObservationLocked initializes fanObservation once per process,
+// seeding it from fanObservationStatePath when that file holds valid JSON.
+// Entries with a blank name or a non-positive RPM are dropped. A missing,
+// empty or unparsable file leaves an empty map. Later calls are no-ops.
+//
+// The "Locked" suffix suggests callers hold fanObservationMu; both callers
+// visible here do.
+func loadFanObservationLocked() {
+	if fanObservationInit {
+		return
+	}
+	// Mark as initialized up front so a bad file is not re-read on every call.
+	fanObservationInit = true
+	fanObservation.MaxRPM = make(map[string]float64)
+
+	data, readErr := os.ReadFile(fanObservationStatePath)
+	if readErr != nil || len(data) == 0 {
+		return
+	}
+	var onDisk fanObservationState
+	if err := json.Unmarshal(data, &onDisk); err != nil {
+		return
+	}
+	for rawName, rpm := range onDisk.MaxRPM {
+		key := strings.TrimSpace(rawName)
+		if key == "" || rpm <= 0 {
+			continue
+		}
+		fanObservation.MaxRPM[key] = rpm
+	}
+}
+
+// saveFanObservationLocked writes fanObservation as indented JSON to
+// fanObservationStatePath, creating the parent directory first. It is
+// best-effort: nothing is written when the map is empty, and every failure is
+// silently ignored.
+func saveFanObservationLocked() {
+	if len(fanObservation.MaxRPM) == 0 {
+		return
+	}
+	targetDir := filepath.Dir(fanObservationStatePath)
+	switch targetDir {
+	case "", ".":
+		// Only the directory to create falls back to the default; the file
+		// itself is still written to fanObservationStatePath below.
+		targetDir = "/var/log/bee-sat"
+	}
+	if os.MkdirAll(targetDir, 0755) != nil {
+		return
+	}
+	payload, err := json.MarshalIndent(fanObservation, "", "  ")
+	if err != nil {
+		return
+	}
+	_ = os.WriteFile(fanObservationStatePath, payload, 0644)
+}
+
+// updateFanObservation feeds one batch of fan readings, taken at time now,
+// into the per-fan observed-maximum tracker.
+//
+// A reading above the recorded maximum is not accepted immediately. It first
+// becomes a pending candidate, and the maximum is only raised once a later
+// above-maximum reading arrives at least fanObservationMinPeakHold after the
+// candidate was first seen. Any reading at or below the recorded maximum
+// discards the pending candidate. Accepted maxima are rounded up by
+// normalizeObservedFanMaxRPM and persisted when anything changed.
+func updateFanObservation(fans []FanReading, now time.Time) {
+	if len(fans) == 0 {
+		return
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+	changed := false
+	for _, fan := range fans {
+		name := strings.TrimSpace(fan.Name)
+		// Skip unnamed fans and non-positive readings.
+		if name == "" || fan.RPM <= 0 {
+			continue
+		}
+		// A missing map entry reads as 0, so the first positive reading for a
+		// fan always counts as "above the maximum".
+		currentMax := fanObservation.MaxRPM[name]
+		if fan.RPM <= currentMax {
+			// Back at or below the known maximum: drop any pending candidate.
+			delete(fanPeakCandidates, name)
+			continue
+		}
+		if cand, ok := fanPeakCandidates[name]; ok {
+			if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
+				// The candidate has been pending long enough: accept the
+				// larger of the pending peak and this reading.
+				newMax := math.Max(cand.RPM, fan.RPM)
+				if newMax > currentMax {
+					fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
+					changed = true
+				}
+				delete(fanPeakCandidates, name)
+				continue
+			}
+			// Still inside the hold window: keep the original FirstSeen and
+			// only raise the pending peak value.
+			if fan.RPM > cand.RPM {
+				fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
+			}
+			continue
+		}
+		// First above-maximum reading for this fan: start a new candidate.
+		fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
+	}
+	if changed {
+		saveFanObservationLocked()
+	}
+}
+
+// estimateFanDutyCyclePctFromObservation estimates a fan duty cycle as the
+// mean, over all usable fans, of current RPM divided by that fan's recorded
+// maximum RPM, expressed in percent and clamped to [0, 100].
+//
+// Fans with a blank name, a non-positive RPM or no recorded maximum are
+// ignored. The second result is false when no fan contributed a sample.
+func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
+	if len(fans) == 0 {
+		return 0, false
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+
+	var pcts []float64
+	for i := range fans {
+		reading := fans[i]
+		key := strings.TrimSpace(reading.Name)
+		if key == "" || reading.RPM <= 0 {
+			continue
+		}
+		ceiling := fanObservation.MaxRPM[key]
+		if ceiling <= 0 {
+			continue
+		}
+		pct := reading.RPM / ceiling * 100.0
+		switch {
+		case pct > 100:
+			pct = 100
+		case pct < 0:
+			pct = 0
+		}
+		pcts = append(pcts, pct)
+	}
+	if len(pcts) == 0 {
+		return 0, false
+	}
+	return benchmarkMean(pcts), true
+}
+
 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
 // Handles two formats:
 //
@@ -428,12 +573,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {

 // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
 // Returns the average duty cycle across all exposed PWM controls.
-func sampleFanDutyCyclePct() (float64, bool) {
+func sampleFanDutyCyclePct() (float64, bool, bool) {
 	out, err := exec.Command("sensors", "-j").Output()
 	if err != nil || len(out) == 0 {
-		return 0, false
+		fans, fanErr := sampleFanSpeeds()
+		if fanErr != nil {
+			return 0, false, false
+		}
+		return sampleFanDutyCyclePctFromFans(fans)
 	}
-	return parseFanDutyCyclePctSensorsJSON(out)
+	pct, ok := parseFanDutyCyclePctSensorsJSON(out)
+	return pct, ok, false
+}
+
+// sampleFanDutyCyclePctFromFans derives a duty-cycle percentage from RPM
+// readings using the recorded per-fan maxima. On success it returns
+// (pct, true, true), where the final true marks the value as an estimate
+// rather than a PWM reading. Otherwise it returns (0, false, false).
+func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
+	if len(fans) > 0 {
+		if estimated, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
+			return estimated, true, true
+		}
+	}
+	return 0, false, false
+}

 func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
@@ -608,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
 	return max
 }

-// sampleSystemPower reads system power draw via DCMI.
-func sampleSystemPower() float64 {
+// sampleSystemPowerResolved reads system power via the global autotune source,
+// falling back to the historical heuristic before autotune or when degraded.
+func sampleSystemPowerResolved() (float64, string, string) {
 	now := time.Now()
-	current := 0.0
-	out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
-	if err == nil {
-		current = parseDCMIPowerReading(string(out))
-	}
+	current, decision, err := SampleSystemPowerResolved("")
 	systemPowerCacheMu.Lock()
 	defer systemPowerCacheMu.Unlock()
-	value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
+	if err != nil {
+		current = 0
+	}
+	value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
 	systemPowerCache = updated
-	return value
+	return value, updated.Source, updated.Mode
 }

 // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -643,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
 	return 0
 }

-func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
+func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
 	if current > 0 {
-		cache = cachedPowerReading{Value: current, UpdatedAt: now}
+		cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
 		return current, cache
 	}
 	if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
@@ -1,6 +1,7 @@
 package platform

 import (
+	"path/filepath"
 	"testing"
 	"time"
 )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
 	}
 }

+// TestEstimateFanDutyCyclePctFromObservation covers the observed-maximum
+// tracker end to end: a single spike must not establish a maximum, a peak held
+// past fanObservationMinPeakHold must, and the accepted maximum must be
+// reloaded from disk after the in-memory state is cleared.
+//
+// This test deliberately does NOT call t.Parallel(): it swaps package-level
+// state (fanObservationStatePath, fanObservation, fanObservationInit,
+// fanPeakCandidates) without holding fanObservationMu. Running it alongside
+// other parallel tests would be a data race and could leak the temporary
+// state into them.
+func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
+	oldPath := fanObservationStatePath
+	oldState := fanObservation
+	oldInit := fanObservationInit
+	oldCandidates := fanPeakCandidates
+	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	t.Cleanup(func() {
+		fanObservationStatePath = oldPath
+		fanObservation = oldState
+		fanObservationInit = oldInit
+		fanPeakCandidates = oldCandidates
+	})
+
+	// A first reading only creates a pending candidate; no maximum exists yet.
+	start := time.Unix(100, 0)
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
+	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
+		t.Fatalf("single-sample spike should not establish observed max")
+	}
+
+	// +0.5 s raises the pending peak to 5200; +1.5 s is past the hold window,
+	// so 5200 is accepted and rounded up to 6000.
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
+
+	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected estimated duty cycle from persisted observed max")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("got=%v want ~43.3", got)
+	}
+
+	// Clear the in-memory state so the next call must reload from the file.
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected persisted observed max to be reloaded from disk")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("reloaded got=%v want ~43.3", got)
+	}
+}
+
 func TestParseDCMIPowerReading(t *testing.T) {
 	raw := `
 Instantaneous power reading:                   512 Watts
@@ -64,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 	now := time.Now()
 	cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}

-	got, updated := effectiveSystemPowerReading(cache, 0, now)
+	got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
 	if got != 480 {
 		t.Fatalf("got=%v want cached 480", got)
 	}
@@ -72,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 		t.Fatalf("updated=%+v", updated)
 	}

-	got, updated = effectiveSystemPowerReading(cache, 530, now)
+	got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
 	if got != 530 {
 		t.Fatalf("got=%v want 530", got)
 	}
@@ -81,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 	}

 	expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
-	got, _ = effectiveSystemPowerReading(expired, 0, now)
+	got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
 	if got != 0 {
 		t.Fatalf("expired cache returned %v want 0", got)
 	}
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 	}
 }

+// TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth checks that the
+// nvbandwidth diag command built with duration 0 and GPUs {2, 0} is exactly
+// "dcgmi diag -r nvbandwidth -i 2,0", with no extra arguments.
+func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
+	want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
+	cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
+
+	// Compare lengths first so the element loop cannot index out of range.
+	if gotLen, wantLen := len(cmd), len(want); gotLen != wantLen {
+		t.Fatalf("cmd len=%d want %d (%v)", gotLen, wantLen, cmd)
+	}
+	for i, expected := range want {
+		if actual := cmd[i]; actual != expected {
+			t.Fatalf("cmd[%d]=%q want %q", i, actual, expected)
+		}
+	}
+}
+
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
 	if len(env) != 2 {
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
 }

 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
+	if name == "bee-nvidia" && action == ServiceRestart {
+		return restartNvidiaDrivers()
+	}
 	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
 	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
@@ -211,6 +211,7 @@ type HardwarePCIeDevice struct {
 	Firmware               *string        `json:"firmware,omitempty"`
 	MacAddresses           []string       `json:"mac_addresses,omitempty"`
 	Present                *bool          `json:"present,omitempty"`
+	IOMMUGroup             *int           `json:"iommu_group,omitempty"`
 	Telemetry              map[string]any `json:"-"`
 }

@@ -44,3 +44,48 @@ func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
 		t.Fatalf("missing event_logs payload: %s", text)
 	}
 }
+
+// TestHardwareSnapshotMarshalsStorageTelemetryFields marshals an ingest
+// request holding one storage device and checks that its telemetry fields
+// appear in the JSON under the expected keys and values.
+func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
+	var (
+		hours     int64   = 12450
+		written   int64   = 9876543210
+		read      int64   = 1234567890
+		remaining float64 = 91.0
+	)
+
+	disk := HardwareStorage{
+		SerialNumber:     stringPtr("DISK-001"),
+		Model:            stringPtr("TestDisk"),
+		PowerOnHours:     &hours,
+		WrittenBytes:     &written,
+		ReadBytes:        &read,
+		LifeRemainingPct: &remaining,
+	}
+	req := HardwareIngestRequest{
+		CollectedAt: "2026-03-15T15:00:00Z",
+		Hardware: HardwareSnapshot{
+			Board:   HardwareBoard{SerialNumber: "SRV-001"},
+			Storage: []HardwareStorage{disk},
+		},
+	}
+
+	encoded, err := json.Marshal(req)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	got := string(encoded)
+
+	// Substring checks keep the test independent of field ordering elsewhere
+	// in the payload.
+	wantFragments := []string{
+		`"storage":[{`,
+		`"power_on_hours":12450`,
+		`"written_bytes":9876543210`,
+		`"read_bytes":1234567890`,
+		`"life_remaining_pct":91`,
+	}
+	for i := range wantFragments {
+		if frag := wantFragments[i]; !strings.Contains(got, frag) {
+			t.Fatalf("missing %q in payload: %s", frag, got)
+		}
+	}
+}
+
+// stringPtr returns a pointer to a fresh copy of v, for filling optional
+// *string fields in test fixtures.
+func stringPtr(v string) *string {
+	out := new(string)
+	*out = v
+	return out
+}
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
 		return taskPriorityInstallToRAM
 	case "audit":
 		return taskPriorityAudit
-	case "nvidia-bench-perf", "nvidia-bench-power":
+	case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
 		return taskPriorityBenchmark
 	case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
 		return taskPriorityBurn
@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
 		}

 		if rampUp && len(body.GPUIndices) > 1 {
-			// Ramp-up mode: resolve GPU list, then create one task per prefix
-			// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
+			// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
+			// in Phase 2 (one additional GPU per step). A single task with all
+			// selected GPUs is sufficient — spawning N tasks with growing subsets
+			// would repeat all earlier steps redundantly.
 			gpus, err := apiListNvidiaGPUs(h.opts.App)
 			if err != nil {
 				writeError(w, http.StatusBadRequest, err.Error())
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
 			} else {
 				now := time.Now()
 				rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
-				var allTasks []*Task
-				for step := 1; step <= len(resolved); step++ {
-					subset := resolved[:step]
-					stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
-					t := &Task{
-						ID:        newJobID("bee-bench-nvidia"),
-						Name:      stepName,
-						Target:    target,
-						Priority:  defaultTaskPriority(target, taskParams{}),
-						Status:    TaskPending,
-						CreatedAt: now,
-						params: taskParams{
-							GPUIndices:       append([]int(nil), subset...),
-							SizeMB:           body.SizeMB,
-							BenchmarkProfile: body.Profile,
-							RunNCCL:          runNCCL && step == len(resolved),
-							ParallelGPUs:     true,
-							RampStep:         step,
-							RampTotal:        len(resolved),
-							RampRunID:        rampRunID,
-							DisplayName:      stepName,
-						},
-					}
-					allTasks = append(allTasks, t)
+				taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
+				t := &Task{
+					ID:        newJobID("bee-bench-nvidia"),
+					Name:      taskName,
+					Target:    target,
+					Priority:  defaultTaskPriority(target, taskParams{}),
+					Status:    TaskPending,
+					CreatedAt: now,
+					params: taskParams{
+						GPUIndices:       append([]int(nil), resolved...),
+						SizeMB:           body.SizeMB,
+						BenchmarkProfile: body.Profile,
+						RunNCCL:          runNCCL,
+						ParallelGPUs:     true,
+						RampTotal:        len(resolved),
+						RampRunID:        rampRunID,
+						DisplayName:      taskName,
+					},
 				}
-				for _, t := range allTasks {
-					globalQueue.enqueue(t)
-				}
-				writeTaskRunResponse(w, allTasks)
+				globalQueue.enqueue(t)
+				writeTaskRunResponse(w, []*Task{t})
 				return
 			}
 		}
@@ -707,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
 	}
 }

+// handleAPIBenchmarkAutotuneRun returns a handler that queues a single
+// "nvidia-bench-autotune" task built from the optional JSON request body.
+// A blank profile falls back to "standard" and a blank benchmark_kind to
+// "power-fit". An empty body is accepted; any other decode error yields 400.
+// Without a configured App the handler answers 503.
+func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
+	const target = "nvidia-bench-autotune"
+	type request struct {
+		Profile       string `json:"profile"`
+		BenchmarkKind string `json:"benchmark_kind"`
+		SizeMB        int    `json:"size_mb"`
+	}
+	// orDefault trims s and substitutes fallback when nothing is left.
+	orDefault := func(s, fallback string) string {
+		if trimmed := strings.TrimSpace(s); trimmed != "" {
+			return trimmed
+		}
+		return fallback
+	}
+	return func(w http.ResponseWriter, r *http.Request) {
+		if h.opts.App == nil {
+			writeError(w, http.StatusServiceUnavailable, "app not configured")
+			return
+		}
+		var req request
+		if r.Body != nil {
+			err := json.NewDecoder(r.Body).Decode(&req)
+			// io.EOF means the body was empty, which selects all defaults.
+			if err != nil && !errors.Is(err, io.EOF) {
+				writeError(w, http.StatusBadRequest, "invalid request body")
+				return
+			}
+		}
+		profile := orDefault(req.Profile, "standard")
+		kind := orDefault(req.BenchmarkKind, "power-fit")
+		createdAt := time.Now()
+		displayName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, kind)
+		task := &Task{
+			ID:        newJobID("bee-bench-autotune"),
+			Name:      displayName,
+			Target:    target,
+			Priority:  defaultTaskPriority(target, taskParams{}),
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params: taskParams{
+				BenchmarkProfile: profile,
+				BenchmarkKind:    kind,
+				SizeMB:           req.SizeMB,
+				DisplayName:      displayName,
+			},
+		}
+		globalQueue.enqueue(task)
+		writeTaskRunResponse(w, []*Task{task})
+	}
+}
+
+// handleAPIBenchmarkAutotuneStatus reports whether a benchmark power autotune
+// configuration could be loaded, together with the system power decision
+// resolved for the export directory.
+//
+// A missing configuration is not an error: it is answered with
+// {"configured": false}. Any other load failure yields 500, and a handler
+// without a configured App answers 503.
+func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
+	if err != nil {
+		// errors.Is also matches a wrapped not-exist error, which
+		// os.IsNotExist would miss; this mirrors the other handlers here.
+		if errors.Is(err, os.ErrNotExist) {
+			// Let writeJSON own the response headers: an explicit
+			// WriteHeader before it would freeze the header map, so any
+			// header writeJSON sets afterwards would be silently dropped.
+			writeJSON(w, map[string]any{
+				"configured": false,
+				"decision":   platform.ResolveSystemPowerDecision(h.opts.ExportDir),
+			})
+			return
+		}
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	writeJSON(w, map[string]any{
+		"configured": true,
+		"config":     cfg,
+		"decision":   platform.ResolveSystemPowerDecision(h.opts.ExportDir),
+	})
+}
+
 func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
 	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
 }
@@ -740,12 +806,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
 			now := time.Now()
 			t.DoneAt = &now
 		case TaskRunning:
-			if t.job != nil {
-				t.job.abort()
+			if t.job == nil || !t.job.abort() {
+				globalQueue.mu.Unlock()
+				writeJSON(w, map[string]string{"status": "not_running"})
+				return
 			}
-			t.Status = TaskCancelled
-			now := time.Now()
-			t.DoneAt = &now
+			globalQueue.mu.Unlock()
+			writeJSON(w, map[string]string{"status": "aborting"})
+			return
 		}
 		globalQueue.mu.Unlock()
 		writeJSON(w, map[string]string{"status": "aborted"})
@@ -970,6 +1038,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
 	writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
 }

+// handleAPIBlackboxStatus serves the black-box state read from
+// blackbox-state.json inside the export directory. When that file does not
+// exist the response is a state with status "disabled" and no targets. A nil
+// target list is replaced by an empty one before the state is written.
+func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
+	statePath := filepath.Join(h.opts.ExportDir, "blackbox-state.json")
+	state, err := app.ReadBlackboxState(statePath)
+	switch {
+	case err == nil:
+		// State loaded; handled after the switch.
+	case errors.Is(err, os.ErrNotExist):
+		writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
+		return
+	default:
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	if state.Targets == nil {
+		state.Targets = []app.BlackboxTargetStatus{}
+	}
+	writeJSON(w, state)
+}
+
+// handleAPIBlackboxEnable enables black-box recording on a removable target.
+// The request body must name a device that appears in the App's current
+// removable target list; the listed entry (not the client-supplied one) is
+// what gets passed to app.EnableBlackboxTarget. On success the response
+// carries the enrollment ID of the written marker.
+func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	var requested platform.RemovableTarget
+	decodeErr := json.NewDecoder(r.Body).Decode(&requested)
+	if decodeErr != nil || strings.TrimSpace(requested.Device) == "" {
+		writeError(w, http.StatusBadRequest, "device is required")
+		return
+	}
+	known, err := h.opts.App.ListRemovableTargets()
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	// Find the first listed target with the requested device path.
+	var matched *platform.RemovableTarget
+	for i := range known {
+		if known[i].Device == requested.Device {
+			matched = &known[i]
+			break
+		}
+	}
+	if matched == nil {
+		writeError(w, http.StatusBadRequest, "device not in removable target list")
+		return
+	}
+	marker, err := app.EnableBlackboxTarget(*matched)
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	writeJSON(w, map[string]any{
+		"status":        "ok",
+		"message":       "Black-box marker written.",
+		"enrollment_id": marker.EnrollmentID,
+	})
+}
+
+// handleAPIBlackboxDisable disables a black-box target identified by the
+// device and enrollment_id fields of the JSON request body. An undecodable
+// body yields 400, a not-exist error from app.DisableBlackboxTarget yields
+// 404, and any other failure yields 500.
+func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
+	var payload struct {
+		Device       string `json:"device"`
+		EnrollmentID string `json:"enrollment_id"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&payload); decodeErr != nil {
+		writeError(w, http.StatusBadRequest, "invalid request body")
+		return
+	}
+	err := app.DisableBlackboxTarget(payload.Device, payload.EnrollmentID)
+	switch {
+	case err == nil:
+		writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
+	case errors.Is(err, os.ErrNotExist):
+		writeError(w, http.StatusNotFound, "black-box target not found")
+	default:
+		writeError(w, http.StatusInternalServerError, err.Error())
+	}
+}
+
 // ── GPU presence ──────────────────────────────────────────────────────────────

 func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
@@ -1529,6 +1672,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
 	writeJSON(w, map[string]string{"status": "rolled back"})
 }

+// handleAPIBenchmarkResults responds with the output of
+// renderBenchmarkResultsCard for the export directory, served as
+// text/html (UTF-8). The request itself is not inspected.
+func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
+}
+
 func (h *handler) rollbackPendingNetworkChange() error {
 	h.pendingNetMu.Lock()
 	pnc := h.pendingNet
@@ -3,6 +3,8 @@ package webui
 import (
 	"encoding/json"
 	"net/http/httptest"
+	"os"
+	"path/filepath"
 	"strings"
 	"testing"

@@ -44,6 +46,45 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	}
 }

+// TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing checks that an
+// export directory without blackbox-state.json is reported as "disabled"
+// with a 200 response rather than as an error.
+func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
+	// A fresh temp dir guarantees the state file is absent.
+	h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBlackboxStatus(rec, httptest.NewRequest("GET", "/api/blackbox/status", nil))
+
+	if code := rec.Code; code != 200 {
+		t.Fatalf("status=%d body=%s", code, rec.Body.String())
+	}
+	var decoded app.BlackboxState
+	err := json.Unmarshal(rec.Body.Bytes(), &decoded)
+	if err != nil {
+		t.Fatalf("decode state: %v", err)
+	}
+	if got := decoded.Status; got != "disabled" {
+		t.Fatalf("status=%q want disabled", got)
+	}
+}
+
+// TestHandleAPIBlackboxStatusReturnsPersistedState checks that a state file
+// present in the export directory is passed through to the response.
+func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
+	const persisted = `{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`
+	dir := t.TempDir()
+	if err := os.WriteFile(filepath.Join(dir, "blackbox-state.json"), []byte(persisted), 0644); err != nil {
+		t.Fatalf("write state: %v", err)
+	}
+	h := &handler{opts: HandlerOptions{ExportDir: dir}}
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBlackboxStatus(rec, httptest.NewRequest("GET", "/api/blackbox/status", nil))
+
+	body := rec.Body.String()
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, body)
+	}
+	// The boot folder only exists in the file, so seeing it proves the
+	// handler returned the persisted state and not a default.
+	if !strings.Contains(body, `"boot_folder":"boot-folder"`) {
+		t.Fatalf("body=%s", body)
+	}
+}
+
 func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
@@ -178,16 +219,54 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
 	}
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
-	if len(globalQueue.tasks) != 3 {
-		t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
+	// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
+	// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
 	}
-	for i, task := range globalQueue.tasks {
-		if task.Target != "nvidia-bench-power" {
-			t.Fatalf("task[%d] target=%q", i, task.Target)
-		}
-		if task.Priority != taskPriorityBenchmark {
-			t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
-		}
+	task := globalQueue.tasks[0]
+	if task.Target != "nvidia-bench-power" {
+		t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
+	}
+	if task.Priority != taskPriorityBenchmark {
+		t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
+	}
+	if task.params.RampTotal != 3 {
+		t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
+	}
+}
+
+// TestHandleAPIBenchmarkAutotuneRunQueuesTask checks that one autotune
+// request enqueues exactly one "nvidia-bench-autotune" task carrying the
+// requested benchmark kind.
+func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
+	// Swap the global queue for an empty one and restore it afterwards so
+	// other tests are unaffected.
+	globalQueue.mu.Lock()
+	saved := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		defer globalQueue.mu.Unlock()
+		globalQueue.tasks = saved
+	})
+
+	const payload = `{"profile":"standard","benchmark_kind":"power-fit"}`
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(payload))
+
+	h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if queued := len(globalQueue.tasks); queued != 1 {
+		t.Fatalf("tasks=%d want 1", queued)
+	}
+	queuedTask := globalQueue.tasks[0]
+	if queuedTask.Target != "nvidia-bench-autotune" {
+		t.Fatalf("task target=%q want nvidia-bench-autotune", queuedTask.Target)
+	}
+	if queuedTask.params.BenchmarkKind != "power-fit" {
+		t.Fatalf("task benchmark kind=%q want power-fit", queuedTask.params.BenchmarkKind)
 	}
 }

@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
 	return out
 }

+// renderStackedMetricChartSVG renders a stacked area chart where each dataset
+// is visually "stacked" on top of the previous one. Intended for multi-PSU
+// power charts where the filled area of each PSU shows its individual
+// contribution and the total height equals the combined draw.
+//
+// labels and times describe the X axis; the longer of the two sets the point
+// count and the shorter one is padded. datasets holds one series per layer
+// (bottom layer first) and names supplies the legend text for the series at
+// the same index: a series without a matching name gets an empty legend
+// label, and surplus names are ignored. When yMax is nil the upper bound is
+// taken from autoMax120 applied to the stacked total. The caller's datasets
+// slice is left untouched. The returned error is always nil.
+func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount = 1
+		labels = []string{""}
+		times = []time.Time{{}}
+	}
+	if len(labels) < pointCount {
+		padded := make([]string, pointCount)
+		copy(padded, labels)
+		labels = padded
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+	// Pad empty series on a copy of the outer slice so the caller's
+	// datasets argument is not modified behind its back.
+	ownDatasets := make([][]float64, len(datasets))
+	copy(ownDatasets, datasets)
+	datasets = ownDatasets
+	for i := range datasets {
+		if len(datasets[i]) == 0 {
+			datasets[i] = make([]float64, pointCount)
+		}
+	}
+
+	times, datasets = downsampleTimeSeries(times, datasets, 1400)
+	pointCount = len(times)
+
+	// Build cumulative sums per time point. The inner loop walks the time
+	// axis rather than each series' own length: a series shorter than the
+	// axis contributes 0 for its missing samples (its layer collapses onto
+	// the layer below instead of dropping to the X axis), and samples beyond
+	// the axis are ignored instead of indexing out of range.
+	cumulative := make([][]float64, len(datasets)+1)
+	for i := range cumulative {
+		cumulative[i] = make([]float64, pointCount)
+	}
+	for i, ds := range datasets {
+		for j := 0; j < pointCount; j++ {
+			v := 0.0
+			if j < len(ds) {
+				v = ds[j]
+			}
+			cumulative[i+1][j] = cumulative[i][j] + v
+		}
+	}
+
+	// Scale is based on the total (top cumulative row).
+	total := cumulative[len(cumulative)-1]
+	yMin := floatPtr(0)
+	if yMax == nil {
+		yMax = autoMax120(total)
+	}
+	scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
+
+	// One legend entry (and therefore one colour) per dataset. Driving the
+	// loop by datasets keeps every layer coloured even when names is short
+	// and avoids an out-of-range index when names is long.
+	legendItems := make([]metricChartSeries, len(datasets))
+	for i := range datasets {
+		name := ""
+		if i < len(names) {
+			name = names[i]
+		}
+		color := metricChartPalette[i%len(metricChartPalette)]
+		legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
+	}
+
+	// Stats label from totals.
+	statsLabel := chartStatsLabel([][]float64{total})
+
+	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
+	start, end := chartTimeBounds(times)
+
+	var b strings.Builder
+	writeSVGOpen(&b, layout.Width, layout.Height)
+	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	writeHorizontalGrid(&b, layout, scale)
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+	writeSingleAxisY(&b, layout, scale)
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+
+	// Draw stacked areas from top to bottom so lower layers are visible.
+	for i := len(datasets) - 1; i >= 0; i-- {
+		writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
+	}
+	// Draw border polylines on top.
+	for i := len(datasets) - 1; i >= 0; i-- {
+		writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
+	}
+
+	writeLegend(&b, layout, legendItems)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// writeStackedArea emits one SVG <polygon> covering the band between two
+// cumulative series: it walks left-to-right along top, then right-to-left
+// along baseline, and fills the enclosed shape with color at 0.55 opacity.
+// Nothing is written when top is empty; a baseline shorter than top is
+// replaced by an all-zero baseline of the same length.
+func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
+	count := len(top)
+	if count == 0 {
+		return
+	}
+	if len(baseline) < count {
+		baseline = make([]float64, count)
+	}
+
+	var vertices strings.Builder
+	// addVertex appends the "x,y" pair for sample idx of series, separated
+	// from any previously written vertex by a single space.
+	addVertex := func(series []float64, idx int) {
+		x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
+		y := chartYForValue(valueClamp(series[idx], scale), scale, layout.PlotTop, layout.PlotBottom)
+		if vertices.Len() > 0 {
+			vertices.WriteByte(' ')
+		}
+		vertices.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
+		vertices.WriteByte(',')
+		vertices.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
+	}
+	// Upper edge, oldest sample first.
+	for idx := 0; idx < count; idx++ {
+		addVertex(top, idx)
+	}
+	// Lower edge, newest sample first, which closes the outline.
+	for idx := count - 1; idx >= 0; idx-- {
+		addVertex(baseline, idx)
+	}
+	fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", vertices.String(), color)
+}
+
 func writeSVGOpen(b *strings.Builder, width, height int) {
 	fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
 }
@@ -1,6 +1,9 @@
 package webui

 import (
+	"bufio"
+	"fmt"
+	"io"
 	"os"
 	"strings"
 	"sync"
@@ -17,6 +20,25 @@ type jobState struct {
 	cancel       func() // optional cancel function; nil if job is not cancellable
 	logPath      string
 	serialPrefix string
+	logFile      *os.File // kept open for the task lifetime to avoid per-line open/close
+	logBuf       *bufio.Writer
+}
+
+// readTaskLogFile loads the task log at path fully into memory. A log whose
+// content exceeds 50 MB is rejected with an error instead of being returned;
+// open and read failures are returned unchanged.
+func readTaskLogFile(path string) ([]byte, error) {
+	const maxTaskLogBytes = 50 << 20
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	// Read one byte past the limit so an oversized log can be told apart
+	// from one that sits exactly at the limit.
+	data, err := io.ReadAll(io.LimitReader(f, maxTaskLogBytes+1))
+	switch {
+	case err != nil:
+		return nil, err
+	case int64(len(data)) > maxTaskLogBytes:
+		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
+	}
+	return data, nil
 }

 // abort cancels the job if it has a cancel function and is not yet done.
@@ -31,13 +53,21 @@ func (j *jobState) abort() bool {
 }

+// append records line through appendWithOptions with both the persistent-log
+// and serial-mirror options switched on.
 func (j *jobState) append(line string) {
+	j.appendWithOptions(line, true, true)
+}
+
+// appendFromLog records line through appendWithOptions with both the
+// persistent-log and serial-mirror options switched off.
+// NOTE(review): presumably meant for lines replayed from an existing log
+// file, so they are not written a second time — confirm against callers.
+func (j *jobState) appendFromLog(line string) {
+	j.appendWithOptions(line, false, false)
+}
+
+func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
 	j.mu.Lock()
 	defer j.mu.Unlock()
 	j.lines = append(j.lines, line)
-	if j.logPath != "" {
-		appendJobLog(j.logPath, line)
+	if persistLog && j.logPath != "" {
+		j.writeLogLineLocked(line)
 	}
-	if j.serialPrefix != "" {
+	if serialMirror && j.serialPrefix != "" {
 		taskSerialWriteLine(j.serialPrefix + line)
 	}
 	for _, ch := range j.subs {
@@ -48,6 +78,35 @@ func (j *jobState) append(line string) {
 	}
 }

+// writeLogLineLocked appends line plus a trailing newline to the task's
+// persistent log through a 64 KiB buffered writer. The file at j.logPath is
+// opened (created if needed, append mode) on the first call and then kept
+// open; if it cannot be opened the line is dropped silently. Write errors are
+// ignored. The caller must hold j.mu.
+func (j *jobState) writeLogLineLocked(line string) {
+	if j.logFile == nil {
+		opened, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
+		if err != nil {
+			return
+		}
+		j.logFile, j.logBuf = opened, bufio.NewWriterSize(opened, 64*1024)
+	}
+	_, _ = j.logBuf.WriteString(line)
+	_ = j.logBuf.WriteByte('\n')
+}
+
+// closeLog flushes any buffered log output and closes the log file, then
+// clears both fields. It takes j.mu itself. Flush and close errors are
+// ignored, and calling it when no log file is open is a no-op.
+func (j *jobState) closeLog() {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+	if buf := j.logBuf; buf != nil {
+		_ = buf.Flush()
+	}
+	if j.logFile == nil {
+		return
+	}
+	_ = j.logFile.Close()
+	j.logFile, j.logBuf = nil, nil
+}
+
 func (j *jobState) finish(errMsg string) {
 	j.mu.Lock()
 	defer j.mu.Unlock()
@@ -119,7 +178,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
 	if logPath == "" {
 		return j
 	}
-	data, err := os.ReadFile(logPath)
+	data, err := readTaskLogFile(logPath)
 	if err != nil || len(data) == 0 {
 		return j
 	}
@@ -0,0 +1,137 @@
+package webui
+
+import (
+	"fmt"
+	"html"
+	"os"
+	"strings"
+)
+
+// layoutHead returns the opening part of an HTML page: doctype, <head> with
+// the HTML-escaped title and the inline stylesheet, and the opening <body>
+// tag. The caller is expected to append the body content and closing tags.
+func layoutHead(title string) string {
+	return `<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width,initial-scale=1">
+<title>` + html.EscapeString(title) + `</title>
+<style>
+:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
+*{box-sizing:border-box;margin:0;padding:0}
+body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
+a{color:var(--accent);text-decoration:none}
+/* Sidebar */
+.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
+.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
+.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
+.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
+.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
+.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
+.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
+.nav{flex:1}
+.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
+.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
+.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
+/* Content */
+.main{flex:1;display:flex;flex-direction:column;overflow:auto}
+.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
+.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
+.content{padding:24px;flex:1}
+/* Cards */
+.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
+.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
+.card-head-actions{justify-content:space-between}
+.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
+.card-body{padding:16px}
+/* Buttons */
+.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
+.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
+.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
+.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
+.btn-sm{padding:5px 10px;font-size:12px}
+/* Tables */
+table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
+th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
+td{padding:9px 14px;border-top:1px solid var(--border-lite)}
+tr:first-child td{border-top:0}
+tbody tr:hover td{background:rgba(0,0,0,.03)}
+/* Status badges */
+.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
+.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Component chips — one small square per device */
+.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
+.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
+.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Output terminal */
+.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
+.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
+/* Forms */
+.form-row{margin-bottom:14px}
+.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
+.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
+.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
+/* Grid */
+.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
+.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
+@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
+/* iframe viewer */
+.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
+/* Alerts */
+.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
+.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
+.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
+</style>
+</head>
+<body>
+`
+}
+
+// layoutNav renders the sidebar: logo, build version, an optional NVIDIA GSP
+// warning badge and the navigation links. The link whose id equals active is
+// marked with the "active" class. A blank buildLabel is shown as "dev". The
+// badge is driven by the content of /run/bee-nvidia-mode and is omitted when
+// that file cannot be read or holds any value other than "gsp-off" or
+// "gsp-stuck".
+func layoutNav(active string, buildLabel string) string {
+	type navEntry struct{ id, label, href, onclick string }
+	entries := []navEntry{
+		{"dashboard", "Dashboard", "/", ""},
+		{"audit", "Audit", "/audit", ""},
+		{"validate", "Validate", "/validate", ""},
+		{"burn", "Burn", "/burn", ""},
+		{"benchmark", "Benchmark", "/benchmark", ""},
+		{"tasks", "Tasks", "/tasks", ""},
+		{"tools", "Tools", "/tools", ""},
+	}
+	if strings.TrimSpace(buildLabel) == "" {
+		buildLabel = "dev"
+	}
+
+	var out strings.Builder
+	out.WriteString(`<aside class="sidebar">`)
+	out.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
+	out.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
+
+	raw, readErr := os.ReadFile("/run/bee-nvidia-mode")
+	if readErr == nil {
+		switch strings.TrimSpace(string(raw)) {
+		case "gsp-off":
+			out.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
+		case "gsp-stuck":
+			out.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
+		}
+	}
+
+	out.WriteString(`<nav class="nav">`)
+	for _, entry := range entries {
+		class := "nav-item"
+		if entry.id == active {
+			class = "nav-item active"
+		}
+		if entry.onclick == "" {
+			fmt.Fprintf(&out, `<a class="%s" href="%s">%s</a>`, class, entry.href, entry.label)
+			continue
+		}
+		fmt.Fprintf(&out, `<a class="%s" href="%s" onclick="%s">%s</a>`,
+			class, entry.href, entry.onclick, entry.label)
+	}
+	out.WriteString(`</nav>`)
+	out.WriteString(`</aside>`)
+	return out.String()
+}
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
  cpu_load_pct REAL,
  mem_load_pct REAL,
  power_w      REAL,
+  power_source TEXT,
+  power_mode   TEXT,
+  power_reason TEXT,
  PRIMARY KEY (ts)
 );
 CREATE TABLE IF NOT EXISTS gpu_metrics (
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
 	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
 		return err
 	}
-	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
+	if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
+		return err
+	}
+	return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
 }

 func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	defer func() { _ = tx.Rollback() }()

 	_, err = tx.Exec(
-		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
-		ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
+		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
+		ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
 	)
 	if err != nil {
 		return err
@@ -161,14 +173,64 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	return tx.Commit()
 }

+// Downsample thins old metrics to one timestamp per 60-second bucket.
+//
+// Only rows with deleteOlderThan <= ts < downsampleBefore are touched; newer
+// rows keep their full resolution. Within each bucket (ts / 60 using integer
+// division) the rows carrying the smallest ts survive and all others are
+// deleted. Each metrics table is processed independently with its own
+// statement, and the first failing statement aborts the run. A nil receiver,
+// a nil database handle or an empty window is a no-op.
+//
+// Called hourly by the metrics collector background goroutine.
+func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	from, until := deleteOlderThan.Unix(), downsampleBefore.Unix()
+	if from >= until {
+		return nil
+	}
+	tables := []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"}
+	for _, name := range tables {
+		// Keep only the rows whose ts is the minimum of their bucket.
+		query := `
+DELETE FROM ` + name + ` WHERE ts >= ? AND ts < ?
+  AND ts NOT IN (
+    SELECT MIN(ts) FROM ` + name + `
+    WHERE ts >= ? AND ts < ?
+    GROUP BY ts / 60
+  )`
+		if _, err := m.db.Exec(query, from, until, from, until); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Prune removes every row with ts earlier than before from all metrics
+// tables and then asks SQLite for a truncating WAL checkpoint. The checkpoint
+// is best effort: its result is ignored. A nil receiver or nil database
+// handle is a no-op.
+// Called hourly by the metrics collector to keep the DB size bounded.
+func (m *MetricsDB) Prune(before time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	cutoff := before.Unix()
+	tables := []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"}
+	for i := range tables {
+		_, err := m.db.Exec("DELETE FROM "+tables[i]+" WHERE ts < ?", cutoff)
+		if err != nil {
+			return err
+		}
+	}
+	_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
+	return nil
+}
+
 // LoadRecent returns up to n samples in chronological order (oldest first).
 func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
 }

 // LoadAll returns all persisted samples in chronological order (oldest first).
 func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
 }

 // LoadBetween returns samples in chronological order within the given time window.
@@ -183,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
 		start, end = end, start
 	}
 	return m.loadSamples(
-		`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
+		`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
 		start.Unix(), end.Unix(),
 	)
 }
@@ -199,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	type sysRow struct {
 		ts            int64
 		cpu, mem, pwr float64
+		powerSource   string
+		powerMode     string
+		powerReason   string
 	}
 	var sysRows []sysRow
 	for rows.Next() {
 		var r sysRow
-		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
+		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
 			continue
 		}
 		sysRows = append(sysRows, r)
@@ -313,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	samples := make([]platform.LiveMetricSample, len(sysRows))
 	for i, r := range sysRows {
 		s := platform.LiveMetricSample{
-			Timestamp:  time.Unix(r.ts, 0).UTC(),
-			CPULoadPct: r.cpu,
-			MemLoadPct: r.mem,
-			PowerW:     r.pwr,
+			Timestamp:   time.Unix(r.ts, 0).UTC(),
+			CPULoadPct:  r.cpu,
+			MemLoadPct:  r.mem,
+			PowerW:      r.pwr,
+			PowerSource: r.powerSource,
+			PowerMode:   r.powerMode,
+			PowerReason: r.powerReason,
 		}
 		for _, idx := range gpuIndices {
 			if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
@@ -0,0 +1,613 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+// benchmarkHistoryRun is one saved performance-benchmark result reduced to the
+// fields the history table renders.
+type benchmarkHistoryRun struct {
+	// generatedAt is the result's timestamp; history is sorted on it, newest first.
+	generatedAt   time.Time
+	// displayTime is generatedAt in local time, formatted "2006-01-02 15:04:05".
+	displayTime   string
+	// gpuScores holds the composite score keyed by GPU index.
+	gpuScores     map[int]float64
+	// gpuStatuses holds the per-GPU status string keyed by GPU index.
+	gpuStatuses   map[int]string
+	// overallStatus is the run-level status; the table shows "OK" when it is empty.
+	overallStatus string
+}
+
+// renderBenchmark returns the HTML, CSS and inline JavaScript for the
+// benchmark page.
+//
+// The markup is a static template; the only server-side values spliced in are
+// the estimated durations formatted by validateFmtDur and the saved-results
+// cards rendered from opts.ExportDir. Everything else (GPU list, autotune
+// status, task log streaming) is filled in by the inline script at runtime.
+//
+// Values that the script receives from the API (GPU name/index/memory and
+// error messages) are passed through benchmarkEscapeHTML before being written
+// via innerHTML, so server-supplied text cannot inject markup.
+func renderBenchmark(opts HandlerOptions) string {
+	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+<div class="grid2">
+  <div class="card">
+    <div class="card-head">Benchmark Setup</div>
+    <div class="card-body">
+      <div class="form-row">
+        <label>Profile</label>
+        <select id="benchmark-profile">
+          <option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
+          <option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
+          <option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
+        </select>
+      </div>
+      <div class="form-row">
+        <label>GPU Selection</label>
+        <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+          <button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
+          <button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
+        </div>
+        <div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+          <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+        </div>
+      </div>
+      <label class="benchmark-cb-row">
+        <input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
+        <span>Sequential — one GPU at a time</span>
+      </label>
+      <label class="benchmark-cb-row" id="benchmark-parallel-label">
+        <input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
+        <span>Parallel — all selected GPUs simultaneously</span>
+      </label>
+      <label class="benchmark-cb-row" id="benchmark-ramp-label">
+        <input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
+        <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
+      </label>
+      <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
+      <div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
+        <button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
+        <button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
+        <button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
+      </div>
+      <span id="benchmark-run-nccl" hidden>nccl-auto</span>
+      <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
+      <div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
+      <div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
+    </div>
+  </div>
+
+  <div class="card">
+    <div class="card-head">Method Split</div>
+    <div class="card-body">
+      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
+      <table>
+        <tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
+        <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
+        <tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
+      </table>
+      <p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
+    </div>
+  </div>
+</div>
+
+` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
+
+<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
+  <div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
+</div>
+
+<style>
+.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
+.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+</style>
+
+<script>
+let benchmarkES = null;
+// Escape API-supplied text before it is concatenated into innerHTML.
+function benchmarkEscapeHTML(value) {
+  return String(value)
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;')
+    .replace(/'/g, '&#39;');
+}
+function benchmarkTaskIDs(payload) {
+  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
+  if (payload && payload.task_id) return [payload.task_id];
+  return [];
+}
+function benchmarkSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
+    .filter(function(el) { return el.checked && !el.disabled; })
+    .map(function(el) { return parseInt(el.value, 10); })
+    .filter(function(v) { return !Number.isNaN(v); })
+    .sort(function(a, b) { return a - b; });
+}
+function benchmarkMode() {
+  const el = document.querySelector('input[name="benchmark-mode"]:checked');
+  return el ? el.value : 'sequential';
+}
+function benchmarkUpdateSelectionNote() {
+  const selected = benchmarkSelectedGPUIndices();
+  const perfBtn = document.getElementById('benchmark-run-performance-btn');
+  const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
+  const note = document.getElementById('benchmark-selection-note');
+  if (!selected.length) {
+    perfBtn.disabled = true;
+    fitBtn.disabled = true;
+    note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
+    return;
+  }
+  perfBtn.disabled = false;
+  fitBtn.disabled = false;
+  const mode = benchmarkMode();
+  if (mode === 'ramp-up') {
+    note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
+  } else if (mode === 'parallel') {
+    note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
+  } else {
+    note.textContent = 'Sequential: each selected GPU benchmarked separately.';
+  }
+}
+function benchmarkRenderGPUList(gpus) {
+  const root = document.getElementById('benchmark-gpu-list');
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    benchmarkUpdateSelectionNote();
+    return;
+  }
+  root.innerHTML = gpus.map(function(gpu) {
+    const index = benchmarkEscapeHTML(gpu.index);
+    const mem = gpu.memory_mb > 0 ? ' · ' + benchmarkEscapeHTML(gpu.memory_mb) + ' MiB' : '';
+    return '<label class="benchmark-gpu-row">'
+      + '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + index + '" checked onchange="benchmarkUpdateSelectionNote()">'
+      + '<span><strong>GPU ' + index + '</strong> — ' + benchmarkEscapeHTML(gpu.name) + mem + '</span>'
+      + '</label>';
+  }).join('');
+  benchmarkApplyMultiGPUState(gpus.length);
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkApplyMultiGPUState(gpuCount) {
+  var multiValues = ['parallel', 'ramp-up'];
+  var radios = document.querySelectorAll('input[name="benchmark-mode"]');
+  radios.forEach(function(el) {
+    var isMulti = multiValues.indexOf(el.value) >= 0;
+    if (gpuCount < 2 && isMulti) {
+      el.disabled = true;
+      if (el.checked) {
+        var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
+        if (seq) seq.checked = true;
+      }
+      var label = el.closest('label');
+      if (label) label.style.opacity = '0.4';
+    } else {
+      el.disabled = false;
+      if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
+      var label = el.closest('label');
+      if (label) label.style.opacity = '';
+    }
+  });
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkLoadGPUs() {
+  const status = document.getElementById('benchmark-run-status');
+  status.textContent = '';
+  fetch('/api/gpu/nvidia').then(function(r) {
+    return r.json().then(function(body) {
+      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+      return body;
+    });
+  }).then(function(gpus) {
+    benchmarkRenderGPUList(gpus);
+  }).catch(function(err) {
+    document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + benchmarkEscapeHTML(err.message) + '</p>';
+    benchmarkUpdateSelectionNote();
+  });
+}
+function benchmarkSelectAll() {
+  document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkSelectNone() {
+  document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
+  benchmarkUpdateSelectionNote();
+}
+function runNvidiaBenchmark(kind) {
+  const selected = benchmarkSelectedGPUIndices();
+  const status = document.getElementById('benchmark-run-status');
+  if (!selected.length) {
+    status.textContent = 'Select at least one GPU.';
+    return;
+  }
+  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
+  const mode = benchmarkMode();
+  const rampUp = mode === 'ramp-up' && selected.length > 1;
+  const parallelGPUs = mode === 'parallel' && kind === 'performance';
+  if (kind === 'power-fit' && mode === 'parallel') {
+    status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
+    return;
+  }
+  const body = {
+    profile: document.getElementById('benchmark-profile').value || 'standard',
+    gpu_indices: selected,
+    run_nccl: kind === 'performance' && selected.length > 1,
+    parallel_gpus: parallelGPUs,
+    ramp_up: rampUp,
+    display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
+  };
+  document.getElementById('benchmark-output').style.display = 'block';
+  document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
+  const term = document.getElementById('benchmark-terminal');
+  term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
+  status.textContent = 'Queueing...';
+  const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
+  fetch(endpoint, {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(body)
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  }).then(function(d) {
+    const taskIds = benchmarkTaskIDs(d);
+    if (!taskIds.length) throw new Error('No benchmark task was queued.');
+    status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
+    const streamNext = function(idx, failures) {
+      if (idx >= taskIds.length) {
+        status.textContent = failures ? 'Completed with failures.' : 'Completed.';
+        return;
+      }
+      const taskId = taskIds[idx];
+      term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
+      benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
+      benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+      benchmarkES.addEventListener('done', function(e) {
+        benchmarkES.close();
+        benchmarkES = null;
+        if (e.data) failures += 1;
+        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+        term.scrollTop = term.scrollHeight;
+        const isLast = (idx + 1 >= taskIds.length);
+        streamNext(idx + 1, failures);
+        if (isLast) { benchmarkRefreshResults(); }
+      });
+      benchmarkES.onerror = function() {
+        if (benchmarkES) {
+          benchmarkES.close();
+          benchmarkES = null;
+        }
+        term.textContent += '\nERROR: stream disconnected.\n';
+        term.scrollTop = term.scrollHeight;
+        streamNext(idx + 1, failures + 1);
+      };
+    };
+    streamNext(0, 0);
+  }).catch(function(err) {
+    status.textContent = 'Error.';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+function benchmarkRenderAutotuneStatus(payload) {
+  const el = document.getElementById('benchmark-autotune-status');
+  if (!el) return;
+  if (!payload || !payload.configured || !payload.config) {
+    el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
+    return;
+  }
+  const cfg = payload.config || {};
+  const decision = payload.decision || {};
+  const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
+  const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
+  const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
+  const mode = decision.mode ? (' · mode ' + decision.mode) : '';
+  el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
+}
+function loadBenchmarkAutotuneStatus() {
+  fetch('/api/bee-bench/nvidia/autotune/status')
+    .then(function(r) {
+      return r.json().then(function(body) {
+        if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+        return body;
+      });
+    })
+    .then(function(body) { benchmarkRenderAutotuneStatus(body); })
+    .catch(function(err) {
+      const el = document.getElementById('benchmark-autotune-status');
+      if (el) el.textContent = 'Autotune status error: ' + err.message;
+    });
+}
+function runBenchmarkAutotune() {
+  const selected = benchmarkSelectedGPUIndices();
+  const status = document.getElementById('benchmark-run-status');
+  const term = document.getElementById('benchmark-terminal');
+  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
+  document.getElementById('benchmark-output').style.display = 'block';
+  document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
+  term.textContent = 'Enqueuing benchmark autotune...\n';
+  status.textContent = 'Queueing autotune...';
+  fetch('/api/bee-bench/nvidia/autotune/run', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify({
+      profile: document.getElementById('benchmark-profile').value || 'standard',
+      benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
+      gpu_indices: selected
+    })
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  }).then(function(d) {
+    const taskIds = benchmarkTaskIDs(d);
+    if (!taskIds.length) throw new Error('No autotune task was queued.');
+    const taskId = taskIds[0];
+    status.textContent = 'Autotune queued: ' + taskId;
+    benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
+    benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    benchmarkES.addEventListener('done', function(e) {
+      if (benchmarkES) {
+        benchmarkES.close();
+        benchmarkES = null;
+      }
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
+      loadBenchmarkAutotuneStatus();
+    });
+  }).catch(function(err) {
+    status.textContent = 'Autotune error.';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+benchmarkLoadGPUs();
+loadBenchmarkAutotuneStatus();
+function benchmarkRefreshResults() {
+  fetch('/api/benchmark/results')
+    .then(function(r) { return r.text(); })
+    .then(function(html) {
+      const el = document.getElementById('benchmark-results-section');
+      if (el) el.innerHTML = html;
+    })
+    .catch(function() {});
+}
+</script>`
+}
+
+// renderBenchmarkResultsCard builds the saved-results section of the benchmark
+// page: the performance history card followed by the power / thermal fit card,
+// separated by a newline. Both cards read their data from exportDir.
+func renderBenchmarkResultsCard(exportDir string) string {
+	maxGPUIndex, history := loadBenchmarkHistory(exportDir)
+	return renderBenchmarkResultsCardFromRuns(
+		"Perf Results",
+		"Composite score by saved benchmark run and GPU.",
+		"No saved performance benchmark runs yet.",
+		maxGPUIndex,
+		history,
+	) + "\n" + renderPowerBenchmarkResultsCard(exportDir)
+}
+
+// renderBenchmarkResultsCardFromRuns renders a card containing one table row
+// per run and one score column per GPU index in [0, maxGPUIndex].
+//
+// title, description and emptyMessage are HTML-escaped before use. When runs
+// is empty only the card with emptyMessage is returned. A blank description is
+// omitted. An empty overall status is displayed as "OK"; "FAILED" is shown in
+// the critical colour and every other non-OK status in the warning colour.
+// GPUs without a score in a run are shown as a muted "-".
+func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
+	cardHead := `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`
+	if len(runs) == 0 {
+		return cardHead + `<p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
+	}
+
+	var out strings.Builder
+	out.WriteString(cardHead)
+	if strings.TrimSpace(description) != "" {
+		out.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
+	}
+
+	// Header row: fixed columns followed by one column per GPU index.
+	out.WriteString(`<div style="overflow-x:auto"><table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
+	for col := 0; col <= maxGPUIndex; col++ {
+		out.WriteString(`<th>GPU ` + strconv.Itoa(col) + `</th>`)
+	}
+	out.WriteString(`</tr></thead><tbody>`)
+
+	for n, run := range runs {
+		status := run.overallStatus
+		if status == "" {
+			status = "OK"
+		}
+		var statusColor string
+		switch status {
+		case "OK":
+			statusColor = "var(--ok)"
+		case "FAILED":
+			statusColor = "var(--crit-fg,#9f3a38)"
+		default:
+			statusColor = "var(--warn)"
+		}
+
+		out.WriteString(`<tr><td>#` + strconv.Itoa(n+1) + `</td>`)
+		out.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+		out.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(status) + `</td>`)
+
+		for col := 0; col <= maxGPUIndex; col++ {
+			score, found := run.gpuScores[col]
+			if !found {
+				out.WriteString(`<td style="color:var(--muted)">-</td>`)
+				continue
+			}
+			// Only a blank or "OK" GPU status leaves the score unstyled.
+			var cellStyle string
+			switch run.gpuStatuses[col] {
+			case "", "OK":
+			case "FAILED":
+				cellStyle = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
+			default:
+				cellStyle = ` style="color:var(--warn);font-weight:600"`
+			}
+			out.WriteString(`<td` + cellStyle + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
+		}
+		out.WriteString(`</tr>`)
+	}
+
+	out.WriteString(`</tbody></table></div></div></div>`)
+	return out.String()
+}
+
+// loadBenchmarkHistory locates every perf-*/result.json under the performance
+// benchmark directory and loads them via loadBenchmarkHistoryFromPaths.
+//
+// The directory is <exportDir>/bee-bench/perf, or app.DefaultBeeBenchPerfDir
+// when exportDir is blank. It returns (-1, nil) when the glob fails or matches
+// nothing.
+func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
+	perfDir := app.DefaultBeeBenchPerfDir
+	if strings.TrimSpace(exportDir) != "" {
+		perfDir = filepath.Join(exportDir, "bee-bench", "perf")
+	}
+	pattern := filepath.Join(perfDir, "perf-*", "result.json")
+	matches, globErr := filepath.Glob(pattern)
+	if globErr != nil || len(matches) == 0 {
+		return -1, nil
+	}
+	sort.Strings(matches)
+	return loadBenchmarkHistoryFromPaths(matches)
+}
+
+// loadBenchmarkHistoryFromPaths decodes each result.json in paths into a
+// benchmarkHistoryRun and returns the runs sorted newest first, together with
+// the highest GPU index seen across all of them (-1 when no GPU was seen).
+// Files that cannot be read or decoded are skipped.
+func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
+	highest := -1
+	history := make([]benchmarkHistoryRun, 0, len(paths))
+	for _, p := range paths {
+		data, readErr := os.ReadFile(p)
+		if readErr != nil {
+			continue
+		}
+		var parsed platform.NvidiaBenchmarkResult
+		if decodeErr := json.Unmarshal(data, &parsed); decodeErr != nil {
+			continue
+		}
+
+		scores := make(map[int]float64)
+		statuses := make(map[int]string)
+		for _, g := range parsed.GPUs {
+			scores[g.Index] = g.Scores.CompositeScore
+			statuses[g.Index] = g.Status
+			if highest < g.Index {
+				highest = g.Index
+			}
+		}
+
+		history = append(history, benchmarkHistoryRun{
+			generatedAt:   parsed.GeneratedAt,
+			displayTime:   parsed.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			gpuScores:     scores,
+			gpuStatuses:   statuses,
+			overallStatus: parsed.OverallStatus,
+		})
+	}
+	sort.Slice(history, func(a, b int) bool {
+		return history[a].generatedAt.After(history[b].generatedAt)
+	})
+	return highest, history
+}
+
+// renderPowerBenchmarkResultsCard renders the "Power / Thermal Fit Results"
+// card from the power-*/result.json files under <exportDir>/bee-bench/power
+// (or app.DefaultBeeBenchPowerDir when exportDir is blank).
+//
+// The card shows a per-GPU table for the newest run and, when more than one
+// run exists, a collapsible list of all runs sorted newest first. Files that
+// cannot be read or decoded are skipped. If no file matches, or none of the
+// matched files can be loaded, the empty-state card is returned.
+func renderPowerBenchmarkResultsCard(exportDir string) string {
+	const emptyCard = `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
+
+	baseDir := app.DefaultBeeBenchPowerDir
+	if strings.TrimSpace(exportDir) != "" {
+		baseDir = filepath.Join(exportDir, "bee-bench", "power")
+	}
+	paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
+	if err != nil || len(paths) == 0 {
+		return emptyCard
+	}
+	sort.Strings(paths)
+
+	type powerRun struct {
+		generatedAt time.Time
+		displayTime string
+		result      platform.NvidiaPowerBenchResult
+	}
+	runs := make([]powerRun, 0, len(paths))
+	for _, path := range paths {
+		raw, err := os.ReadFile(path)
+		if err != nil {
+			continue
+		}
+		var r platform.NvidiaPowerBenchResult
+		if err := json.Unmarshal(raw, &r); err != nil {
+			continue
+		}
+		runs = append(runs, powerRun{
+			generatedAt: r.GeneratedAt,
+			displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			result:      r,
+		})
+	}
+	// Every matched file may have been unreadable or malformed; without this
+	// guard runs[0] below would panic with an index-out-of-range error.
+	if len(runs) == 0 {
+		return emptyCard
+	}
+	sort.Slice(runs, func(i, j int) bool {
+		return runs[i].generatedAt.After(runs[j].generatedAt)
+	})
+
+	var b strings.Builder
+	b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
+
+	latest := runs[0].result
+	b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
+	if latest.Hostname != "" {
+		b.WriteString(` — ` + html.EscapeString(latest.Hostname))
+	}
+	if latest.OverallStatus != "" {
+		statusColor := "var(--ok)"
+		if latest.OverallStatus != "OK" {
+			statusColor = "var(--warn)"
+		}
+		b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
+	}
+	b.WriteString(`</p>`)
+
+	if len(latest.GPUs) > 0 {
+		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
+		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
+		b.WriteString(`</tr></thead><tbody>`)
+		for _, gpu := range latest.GPUs {
+			// Prefer the stable limit; fall back to the applied one when it is unset.
+			finalLimitW := gpu.StablePowerLimitW
+			if finalLimitW <= 0 {
+				finalLimitW = gpu.AppliedPowerLimitW
+			}
+			// Highlight the row when the GPU is flagged as derated or its final
+			// limit is more than 1 W below the default limit.
+			derated := gpu.Derated ||
+				(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
+			rowStyle := ""
+			finalStyle := ""
+			if derated {
+				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
+				finalStyle = ` style="color:#e6a000;font-weight:600"`
+			}
+			statusLabel := gpu.Status
+			if statusLabel == "" {
+				statusLabel = "OK"
+			}
+			statusColor := "var(--ok)"
+			if statusLabel == "FAILED" {
+				statusColor = "var(--crit-fg,#9f3a38)"
+			} else if statusLabel != "OK" {
+				statusColor = "var(--warn)"
+			}
+			nominalStr := "-"
+			if gpu.DefaultPowerLimitW > 0 {
+				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
+			}
+			singleStr := "-"
+			if gpu.AppliedPowerLimitW > 0 {
+				singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+			}
+			multiStr := "-"
+			if gpu.StablePowerLimitW > 0 {
+				multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
+			}
+			p95Str := "-"
+			if gpu.MaxObservedPowerW > 0 {
+				p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
+			}
+			b.WriteString(`<tr` + rowStyle + `>`)
+			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
+			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
+			b.WriteString(`<td>` + nominalStr + `</td>`)
+			b.WriteString(`<td>` + singleStr + `</td>`)
+			b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
+			b.WriteString(`<td>` + p95Str + `</td>`)
+			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
+			b.WriteString(`</tr>`)
+		}
+		b.WriteString(`</tbody></table></div>`)
+	}
+
+	if len(runs) > 1 {
+		b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
+		b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
+		for i, run := range runs {
+			// Treat a missing overall status as "OK", as the per-GPU rows above
+			// do, instead of rendering a blank warning-coloured cell.
+			statusLabel := run.result.OverallStatus
+			if statusLabel == "" {
+				statusLabel = "OK"
+			}
+			statusColor := "var(--ok)"
+			if statusLabel != "OK" {
+				statusColor = "var(--warn)"
+			}
+			b.WriteString(`<tr>`)
+			b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
+			b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+			b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
+			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
+			b.WriteString(`</tr>`)
+		}
+		b.WriteString(`</tbody></table></div></details>`)
+	}
+
+	b.WriteString(`</div></div>`)
+	return b.String()
+}
@@ -0,0 +1,383 @@
+package webui
+
+// renderBurn returns the static HTML, CSS, and JavaScript for the Burn page.
+//
+// The markup holds the burn-profile presets, the NVIDIA GPU selection card,
+// the "GPU Max Load" and "Compute Stress" cards, and an output terminal. The
+// embedded script loads the GPU list from /api/gpu/nvidia and tool
+// availability from /api/gpu/tools, enqueues tasks with
+// POST /api/sat/<target>/run, and streams task output from
+// /api/tasks/<id>/stream through EventSource.
+//
+// Server-provided text (GPU names, error messages) is HTML-escaped before it
+// is assigned to innerHTML, and a failed task sequence is reported exactly
+// once instead of once per recursion level.
+func renderBurn() string {
+	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
+<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Burn Profile</div>
+  <div class="card-body burn-profile-body">
+    <div class="burn-profile-col">
+      <div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
+    </div>
+    <div class="burn-profile-col burn-profile-action">
+      <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
+      <p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
+    </div>
+    <div class="burn-profile-col burn-profile-action">
+      <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
+      <p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
+    </div>
+  </div>
+  <div class="card-body" style="padding-top:0;display:flex;justify-content:center">
+    <span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">NVIDIA GPU Selection</div>
+  <div class="card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
+    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
+      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
+    </div>
+	    <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+	      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+	    </div>
+	    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
+	    <div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
+	      <label class="cb-row">
+	        <input type="radio" name="burn-nvidia-mode" value="sequential" checked>
+	        <span>Sequential — selected GPUs one at a time</span>
+	      </label>
+	      <label class="cb-row" id="burn-parallel-label">
+	        <input type="radio" name="burn-nvidia-mode" value="parallel">
+	        <span>Parallel — all selected GPUs simultaneously</span>
+	      </label>
+	      <label class="cb-row" id="burn-ramp-label">
+	        <input type="radio" name="burn-nvidia-mode" value="ramp-up">
+	        <span>Ramp-up — add one GPU at a time</span>
+	      </label>
+	    </div>
+	  </div>
+	</div>
+
+<div class="burn-section">Core Burn Paths</div>
+<div class="grid2 burn-grid" style="margin-bottom:16px">
+<div class="card burn-card">
+  <div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
+  <div class="card-body burn-card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
+    <label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
+  </div>
+</div>
+
+<div class="card burn-card">
+  <div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
+  <div class="card-body burn-card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
+    <label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
+  </div>
+</div>
+</div>
+
+<div id="bi-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Output <span id="bi-title"></span></div>
+  <div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
+</div>
+
+<style>
+.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
+.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
+.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
+.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
+.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
+.burn-profile-col { min-width:0; }
+.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
+.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
+.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
+.burn-grid { align-items:stretch; }
+.burn-card { height:100%; display:flex; flex-direction:column; }
+.burn-card-body { flex:1; display:flex; flex-direction:column; }
+.card-head-actions { justify-content:space-between; }
+.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
+@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
+</style>
+
+<script>
+let biES = null;
+// burnEscapeHTML makes server-provided text safe to embed in innerHTML markup.
+function burnEscapeHTML(value) {
+  return String(value === null || value === undefined ? '' : value)
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;')
+    .replace(/'/g, '&#39;');
+}
+function burnTaskIDs(payload) {
+  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
+  if (payload && payload.task_id) return [payload.task_id];
+  return [];
+}
+function burnProfile() {
+  const selected = document.querySelector('input[name="burn-profile"]:checked');
+  return selected ? selected.value : 'smoke';
+}
+function burnSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
+    .filter(function(el) { return el.checked && !el.disabled; })
+    .map(function(el) { return parseInt(el.value, 10); })
+    .filter(function(v) { return !Number.isNaN(v); })
+    .sort(function(a, b) { return a - b; });
+}
+function burnNvidiaMode() {
+  const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
+  return el ? el.value : 'sequential';
+}
+function burnApplyMultiGPUState(gpuCount) {
+  var multiValues = ['parallel', 'ramp-up'];
+  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
+  radios.forEach(function(el) {
+    var isMulti = multiValues.indexOf(el.value) >= 0;
+    if (gpuCount < 2 && isMulti) {
+      el.disabled = true;
+      if (el.checked) {
+        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
+        if (seq) seq.checked = true;
+      }
+      var label = el.closest('label');
+      if (label) label.style.opacity = '0.4';
+    } else {
+      el.disabled = false;
+      var label = el.closest('label');
+      if (label) label.style.opacity = '';
+    }
+  });
+}
+function burnUpdateSelectionNote() {
+  const note = document.getElementById('burn-selection-note');
+  const selected = burnSelectedGPUIndices();
+  if (!selected.length) {
+    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
+    return;
+  }
+  note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
+}
+function burnRenderGPUList(gpus) {
+  const root = document.getElementById('burn-gpu-list');
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    burnUpdateSelectionNote();
+    return;
+  }
+  root.innerHTML = gpus.map(function(gpu) {
+    const mem = gpu.memory_mb > 0 ? ' · ' + burnEscapeHTML(gpu.memory_mb) + ' MiB' : '';
+    const index = burnEscapeHTML(gpu.index);
+    return '<label class="burn-gpu-row">'
+      + '<input class="burn-gpu-checkbox" type="checkbox" value="' + index + '" checked onchange="burnUpdateSelectionNote()">'
+      + '<span><strong>GPU ' + index + '</strong> — ' + burnEscapeHTML(gpu.name) + mem + '</span>'
+      + '</label>';
+  }).join('');
+  burnApplyMultiGPUState(gpus.length);
+  burnUpdateSelectionNote();
+}
+function burnSelectAll() {
+  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
+  burnUpdateSelectionNote();
+}
+function burnSelectNone() {
+  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
+  burnUpdateSelectionNote();
+}
+function burnLoadGPUs() {
+  fetch('/api/gpu/nvidia').then(function(r) {
+    return r.json().then(function(body) {
+      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+      return body;
+    });
+  }).then(function(gpus) {
+    burnRenderGPUList(gpus);
+  }).catch(function(err) {
+    document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + burnEscapeHTML(err.message) + '</p>';
+    burnUpdateSelectionNote();
+  });
+}
+function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
+  const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
+  if (useSelectedNvidia) {
+    const selected = burnSelectedGPUIndices();
+    if (!selected.length) {
+      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
+    }
+    body.gpu_indices = selected;
+    const bMode = burnNvidiaMode();
+    if (bMode === 'ramp-up' && selected.length > 1) {
+      body.stagger_gpu_start = true;
+    } else if (bMode === 'parallel' && selected.length > 1) {
+      body.parallel_gpus = true;
+    }
+  }
+  return fetch('/api/sat/' + target + '/run', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(body)
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  });
+}
+function streamTask(taskId, label) {
+  if (biES) { biES.close(); biES = null; }
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
+  const term = document.getElementById('bi-terminal');
+  term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
+  biES = new EventSource('/api/tasks/' + taskId + '/stream');
+  biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+  biES.addEventListener('done', function(e) {
+    biES.close();
+    biES = null;
+    term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+    term.scrollTop = term.scrollHeight;
+  });
+}
+function streamBurnTask(taskId, label, resetTerminal) {
+  return streamBurnTaskSet([taskId], label, resetTerminal);
+}
+function streamBurnTaskSet(taskIds, label, resetTerminal) {
+  if (biES) { biES.close(); biES = null; }
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
+  const term = document.getElementById('bi-terminal');
+  if (resetTerminal) {
+    term.textContent = '';
+  }
+  if (!Array.isArray(taskIds) || !taskIds.length) {
+    term.textContent += 'ERROR: no tasks queued.\n';
+    return Promise.resolve({ok:false, error:'no tasks queued'});
+  }
+  const streamNext = function(idx, failures) {
+    if (idx >= taskIds.length) {
+      return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
+    }
+    const taskId = taskIds[idx];
+    term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
+    return new Promise(function(resolve) {
+      biES = new EventSource('/api/tasks/' + taskId + '/stream');
+      biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+      biES.addEventListener('done', function(e) {
+        biES.close();
+        biES = null;
+        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+        term.scrollTop = term.scrollHeight;
+        resolve(failures + (e.data ? 1 : 0));
+      });
+      biES.onerror = function() {
+        if (biES) {
+          biES.close();
+          biES = null;
+        }
+        term.textContent += '\nERROR: stream disconnected.\n';
+        term.scrollTop = term.scrollHeight;
+        resolve(failures + 1);
+      };
+    }).then(function(nextFailures) {
+      return streamNext(idx + 1, nextFailures);
+    });
+  };
+  return streamNext(0, 0);
+}
+function runBurnTaskSet(tasks, statusElId) {
+  const enabled = tasks.filter(function(t) {
+    const el = document.getElementById(t.id);
+    return el && el.checked && !el.disabled;
+  });
+  const status = statusElId ? document.getElementById(statusElId) : null;
+  if (status) status.textContent = '';
+  if (!enabled.length) {
+    if (status) status.textContent = 'No tasks selected.';
+    return;
+  }
+  const term = document.getElementById('bi-terminal');
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
+  term.textContent = '';
+  const runNext = function(idx) {
+    if (idx >= enabled.length) {
+      if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
+      return Promise.resolve();
+    }
+    const t = enabled[idx];
+    term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
+    if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
+    return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
+      .then(function(d) {
+        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
+      })
+      .then(function() {
+        return runNext(idx + 1);
+      });
+  };
+  // Handle a failure once at the top of the chain. A handler on every
+  // recursion level would print the same error once per completed task and
+  // leave the final rejection unhandled.
+  return runNext(0).catch(function(err) {
+    if (status) status.textContent = 'Error: ' + err.message;
+    document.getElementById('bi-output').style.display = 'block';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+function runPlatformStress() {
+  const status = document.getElementById('burn-all-status');
+  const comps = [];
+  const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
+  const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
+  const hasChecked = function(ids) {
+    return ids.some(function(id) {
+      const el = document.getElementById(id);
+      return el && el.checked && !el.disabled;
+    });
+  };
+  if (hasChecked(computeIDs)) comps.push('cpu');
+  if (hasChecked(gpuIDs)) comps.push('gpu');
+  if (!comps.length) {
+    if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
+    return;
+  }
+  if (status) status.textContent = '';
+  enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', {platform_components: comps}, false).then(function(d) {
+    streamTask(d.task_id, 'Platform Thermal Cycling');
+  }).catch(function(err) {
+    // Surface enqueue failures instead of dropping them silently.
+    if (status) status.textContent = 'Error: ' + err.message;
+  });
+}
+function runAllBurnTasks() {
+  const status = document.getElementById('burn-all-status');
+  const all = [
+    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
+    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
+    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
+    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
+    {id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
+    {id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
+    {id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
+  ];
+  status.textContent = 'Enqueuing...';
+  runBurnTaskSet(all, 'burn-all-status');
+}
+fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
+  const map = {
+    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
+    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
+    'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
+    'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
+  };
+  tools.forEach(function(t) {
+    const spec = map[t.id];
+    if (!spec) return;
+    const cb = document.getElementById(spec.cb);
+    const note = document.getElementById(spec.note);
+    if (!cb) return;
+    if (t.available) {
+      cb.disabled = false;
+    } else if (note) {
+      note.textContent = '— ' + spec.reason;
+    }
+  });
+}).catch(function() {});
+burnLoadGPUs();
+</script>`
+}
@@ -0,0 +1,510 @@
+package webui
+
+import (
+	"fmt"
+	"html"
+	"net/url"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// renderExport returns the HTML for the Export page: a Support Bundle card,
+// an "Export Files" table with one download link per file found under
+// exportDir, and the USB Black-Box card.
+//
+// File names are query-escaped in the link target and HTML-escaped in the
+// link text. If the directory cannot be listed, the table shows the error
+// instead of claiming that no export files exist.
+func renderExport(exportDir string) string {
+	var rows strings.Builder
+	entries, err := listExportFiles(exportDir)
+	switch {
+	case err != nil:
+		// Report the failure; "No export files found." would be misleading here.
+		fmt.Fprintf(&rows, `<tr><td style="color:var(--crit-fg)">Failed to list export files: %s</td></tr>`,
+			html.EscapeString(err.Error()))
+	case len(entries) == 0:
+		rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
+	default:
+		for _, e := range entries {
+			fmt.Fprintf(&rows, `<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
+				url.QueryEscape(e), html.EscapeString(e))
+		}
+	}
+	return `<div class="grid2">
+<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
+<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
+` + renderSupportBundleInline() + `
+</div></div>
+<div class="card"><div class="card-head">Export Files</div><div class="card-body">
+<table><tr><th>File</th></tr>` + rows.String() + `</table>
+</div></div>
+</div>
+
+` + renderUSBExportCard()
+}
+
+// listExportFiles walks exportDir recursively and returns the paths of all
+// non-directory entries, relative to exportDir and sorted lexically.
+//
+// Surrounding whitespace in exportDir is trimmed once and the trimmed value is
+// used both as the walk root and as the base for the relative paths; using
+// different values for the two makes filepath.Rel fail for padded input.
+// A missing directory yields an empty result and a nil error; any other walk
+// error is returned with a nil slice.
+func listExportFiles(exportDir string) ([]string, error) {
+	root := strings.TrimSpace(exportDir)
+	var entries []string
+	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.IsDir() {
+			return nil
+		}
+		rel, err := filepath.Rel(root, path)
+		if err != nil {
+			return err
+		}
+		entries = append(entries, rel)
+		return nil
+	})
+	if err != nil && !os.IsNotExist(err) {
+		return nil, err
+	}
+	sort.Strings(entries)
+	return entries, nil
+}
+
+// renderSupportBundleInline returns the "Download Support Bundle" button, its
+// status line, and the script that defines window.supportBundleDownload.
+//
+// The handler fetches /export/support.tar.gz, takes the file name from the
+// Content-Disposition header when present (falling back to
+// bee-support.tar.gz), and starts the browser download through a temporary
+// object URL. When the request settles the button is re-enabled with the same
+// down-arrow label it had initially (&#8595; is U+2193).
+func renderSupportBundleInline() string {
+	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">&#8595; Download Support Bundle</button>
+<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
+<script>
+window.supportBundleDownload = function() {
+  var btn = document.getElementById('support-bundle-btn');
+  var status = document.getElementById('support-bundle-status');
+  btn.disabled = true;
+  btn.textContent = 'Building...';
+  status.textContent = 'Collecting logs and export data\u2026';
+  status.style.color = 'var(--muted)';
+  var filename = 'bee-support.tar.gz';
+  fetch('/export/support.tar.gz')
+    .then(function(r) {
+      if (!r.ok) throw new Error('HTTP ' + r.status);
+      var cd = r.headers.get('Content-Disposition') || '';
+      var m = cd.match(/filename="?([^";]+)"?/);
+      if (m) filename = m[1];
+      return r.blob();
+    })
+    .then(function(blob) {
+      var url = URL.createObjectURL(blob);
+      var a = document.createElement('a');
+      a.href = url;
+      a.download = filename;
+      document.body.appendChild(a);
+      a.click();
+      document.body.removeChild(a);
+      URL.revokeObjectURL(url);
+      status.textContent = 'Download started.';
+      status.style.color = 'var(--ok-fg)';
+    })
+    .catch(function(e) {
+      status.textContent = 'Error: ' + e.message;
+      status.style.color = 'var(--crit-fg)';
+    })
+    .finally(function() {
+      btn.disabled = false;
+      btn.textContent = '\u2193 Download Support Bundle';
+    });
+};
+</script>`
+}
+
+func renderUSBExportCard() string {
+	return `<div class="card" style="margin-top:16px">
+  <div class="card-head">USB Black-Box
+    <button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">&#8635; Refresh</button>
+  </div>
+  <div class="card-body">` + renderUSBExportInline() + `</div>
+</div>`
+}
+
+// renderUSBExportInline returns the body of the USB black-box panel: status
+// placeholders plus a script that lists removable devices and lets the user
+// enrol or remove them as black-box targets.
+//
+// The script reads /api/export/usb and /api/blackbox/status, renders one table
+// row per device, and posts to /api/blackbox/enable or /api/blackbox/disable.
+// It exposes blackboxRefresh, blackboxEnable, and blackboxDisable on window.
+//
+// Every device- or service-provided value (device path, filesystem, size,
+// label, model, status, last error) is HTML-escaped before it is placed into
+// innerHTML; volume labels on removable media are untrusted input.
+func renderUSBExportInline() string {
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
+<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
+<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
+<div id="usb-targets" style="margin-top:12px"></div>
+<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
+<script>
+(function(){
+// blackboxEscapeHTML makes device/service text safe to embed in innerHTML markup.
+function blackboxEscapeHTML(value) {
+  return String(value === null || value === undefined ? '' : value)
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;')
+    .replace(/'/g, '&#39;');
+}
+function blackboxRefresh() {
+  document.getElementById('usb-status').textContent = 'Scanning...';
+  document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
+  document.getElementById('usb-targets').innerHTML = '';
+  document.getElementById('usb-msg').textContent = '';
+  Promise.all([
+    fetch('/api/export/usb').then(r=>r.json()),
+    fetch('/api/blackbox/status').then(r=>r.json())
+  ]).then(function(values) {
+    const targets = Array.isArray(values[0]) ? values[0] : [];
+    const state = values[1] || {};
+    const active = Array.isArray(state.targets) ? state.targets : [];
+    window._usbTargets = targets;
+    window._blackboxTargets = active;
+    const st = document.getElementById('usb-status');
+    const ct = document.getElementById('usb-targets');
+    const summary = document.getElementById('blackbox-summary');
+    if (state.boot_folder) {
+      summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
+    } else {
+      summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
+    }
+    if (!targets || targets.length === 0) {
+      st.textContent = 'No removable USB devices found.';
+    } else {
+      st.textContent = targets.length + ' device(s) found:';
+    }
+    const byDevice = {};
+    active.forEach(function(item) { byDevice[item.device] = item; });
+    ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
+      targets.map((t, idx) => {
+        // Look up the enrolment by the raw device path; escape only for display.
+        const enrolled = byDevice[t.device || ''];
+        const status = enrolled ? blackboxEscapeHTML(enrolled.status + (enrolled.flush_period ? ', flush ' + enrolled.flush_period : '')) : 'not enrolled';
+        const detail = enrolled && enrolled.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+blackboxEscapeHTML(enrolled.last_error)+'</div>') : '';
+        return '<tr>' +
+          '<td style="font-family:monospace">'+blackboxEscapeHTML(t.device)+'</td>' +
+          '<td>'+blackboxEscapeHTML(t.fs_type)+'</td>' +
+          '<td>'+blackboxEscapeHTML(t.size)+'</td>' +
+          '<td>'+blackboxEscapeHTML(t.label)+'</td>' +
+          '<td style="font-size:12px;color:var(--muted)">'+blackboxEscapeHTML(t.model)+'</td>' +
+          '<td style="font-size:12px">'+status+detail+'</td>' +
+          '<td style="white-space:nowrap">' +
+            (enrolled
+              ? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
+              : '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
+            '<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
+          '</td></tr>';
+      }).join('') + '</table>';
+  }).catch(e => {
+    document.getElementById('usb-status').textContent = 'Error: ' + e;
+  });
+}
+window.blackboxEnable = function(targetIndex, btn) {
+  const target = (window._usbTargets || [])[targetIndex];
+  if (!target) {
+    const msg = document.getElementById('usb-msg');
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: USB target not found. Refresh and try again.';
+    return;
+  }
+  const msg = document.getElementById('usb-msg');
+  const row = btn ? btn.closest('td') : null;
+  const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
+  const originalText = btn ? btn.textContent : '';
+  if (btn) {
+    btn.disabled = true;
+    btn.textContent = 'Enabling...';
+  }
+  if (rowMsg) {
+    rowMsg.style.color = 'var(--muted)';
+    rowMsg.textContent = 'Working...';
+  }
+  msg.style.color = 'var(--muted)';
+  msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
+  fetch('/api/blackbox/enable', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(target)
+  }).then(async r => {
+    const d = await r.json();
+    if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
+    return d;
+  }).then(d => {
+    msg.style.color = 'var(--ok,green)';
+    msg.textContent = d.message || 'Done.';
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--ok,green)';
+      rowMsg.textContent = d.message || 'Done.';
+    }
+  }).catch(e => {
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: '+e;
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--err,red)';
+      rowMsg.textContent = 'Error: ' + e;
+    }
+  }).finally(() => {
+    if (btn) {
+      btn.disabled = false;
+      btn.textContent = originalText;
+    }
+    setTimeout(blackboxRefresh, 300);
+  });
+};
+window.blackboxDisable = function(targetIndex, btn) {
+  const target = (window._usbTargets || [])[targetIndex];
+  const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
+  if (!target || !active) {
+    const msg = document.getElementById('usb-msg');
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: black-box target not found. Refresh and try again.';
+    return;
+  }
+  const msg = document.getElementById('usb-msg');
+  const row = btn ? btn.closest('td') : null;
+  const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
+  const originalText = btn ? btn.textContent : '';
+  if (btn) {
+    btn.disabled = true;
+    btn.textContent = 'Disabling...';
+  }
+  if (rowMsg) {
+    rowMsg.style.color = 'var(--muted)';
+    rowMsg.textContent = 'Working...';
+  }
+  msg.style.color = 'var(--muted)';
+  msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
+  fetch('/api/blackbox/disable', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
+  }).then(async r => {
+    const d = await r.json();
+    if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
+    return d;
+  }).then(d => {
+    msg.style.color = 'var(--ok,green)';
+    msg.textContent = d.message || 'Done.';
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--ok,green)';
+      rowMsg.textContent = d.message || 'Done.';
+    }
+  }).catch(e => {
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: '+e;
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--err,red)';
+      rowMsg.textContent = 'Error: ' + e;
+    }
+  }).finally(() => {
+    if (btn) {
+      btn.disabled = false;
+      btn.textContent = originalText;
+    }
+    setTimeout(blackboxRefresh, 300);
+  });
+};
+window.blackboxRefresh = blackboxRefresh;
+blackboxRefresh();
+})();
+</script>`
+}
+
+// renderNvidiaSelfHealInline returns the NVIDIA self-heal widget (markup plus
+// inline script) with no wrapping card, so a caller can embed it in a card.
+//
+// The script loads /api/gpu/nvidia-status and draws one table row per GPU,
+// posts {name:"bee-nvidia", action:"restart"} to /api/services/action to
+// restart the driver service, and posts {index} to /api/gpu/nvidia-reset for a
+// per-GPU reset. API-provided values (including raw_line, which is shown when
+// parse_failure is set) are HTML-escaped before being placed into innerHTML.
+func renderNvidiaSelfHealInline() string {
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
+<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
+  <button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
+  <button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">&#8635; Refresh</button>
+</div>
+<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
+<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
+  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
+    <span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
+    <span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
+  </div>
+  <div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
+</div>
+<script>
+function nvidiaSelfHealEsc(s) {
+  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
+}
+function nvidiaSelfHealShowResult(label, status, output) {
+  var out = document.getElementById('nvidia-self-heal-out');
+  var term = document.getElementById('nvidia-self-heal-terminal');
+  var statusEl = document.getElementById('nvidia-self-heal-out-status');
+  var labelEl = document.getElementById('nvidia-self-heal-out-label');
+  out.style.display = 'block';
+  labelEl.textContent = label;
+  term.textContent = output || '(no output)';
+  term.scrollTop = term.scrollHeight;
+  if (status === 'ok') {
+    statusEl.textContent = '✓ done';
+    statusEl.style.color = 'var(--ok-fg, #2c662d)';
+  } else {
+    statusEl.textContent = '✗ failed';
+    statusEl.style.color = 'var(--crit-fg, #9f3a38)';
+  }
+}
+function nvidiaRestartDrivers() {
+  var btn = document.getElementById('nvidia-restart-btn');
+  var original = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = 'Restarting...';
+  nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
+  fetch('/api/services/action', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body:JSON.stringify({name:'bee-nvidia', action:'restart'})
+  }).then(r=>r.json()).then(d => {
+    nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
+    setTimeout(function() {
+      // loadServices is not defined in this snippet; skip it when the host page does not provide it.
+      if (typeof loadServices === 'function') loadServices();
+      loadNvidiaSelfHeal();
+    }, 800);
+  }).catch(e => {
+    nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
+  }).finally(() => {
+    btn.disabled = false;
+    btn.textContent = original;
+  });
+}
+function nvidiaResetGPU(index, btn) {
+  var original = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = 'Resetting...';
+  nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
+  fetch('/api/gpu/nvidia-reset', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body:JSON.stringify({index:index})
+  }).then(r=>r.json()).then(d => {
+    nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || d.error || '(no output)');
+    setTimeout(loadNvidiaSelfHeal, 1000);
+  }).catch(e => {
+    nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
+  }).finally(() => {
+    btn.disabled = false;
+    btn.textContent = original;
+  });
+}
+function loadNvidiaSelfHeal() {
+  var status = document.getElementById('nvidia-self-heal-status');
+  var table = document.getElementById('nvidia-self-heal-table');
+  status.textContent = 'Loading NVIDIA GPU status...';
+  status.style.color = 'var(--muted)';
+  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
+  fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
+    if (!Array.isArray(gpus) || gpus.length === 0) {
+      status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
+      table.innerHTML = '';
+      return;
+    }
+    status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
+    const rows = gpus.map(g => {
+      const serial = g.serial || '';
+      const bdf = g.bdf || '';
+      const id = serial || bdf || ('gpu-' + g.index);
+      const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
+      const details = [];
+      if (serial) details.push('serial ' + serial);
+      if (bdf) details.push('bdf ' + bdf);
+      if (g.parse_failure && g.raw_line) details.push(g.raw_line);
+      return '<tr>'
+        + '<td style="white-space:nowrap">' + nvidiaSelfHealEsc(g.index) + '</td>'
+        + '<td>' + nvidiaSelfHealEsc(g.name || 'unknown') + '</td>'
+        + '<td style="font-family:monospace">' + nvidiaSelfHealEsc(id) + '</td>'
+        + '<td><span class="badge ' + badge + '">' + nvidiaSelfHealEsc(g.status || 'UNKNOWN') + '</span>'
+        + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.map(function(x) { return nvidiaSelfHealEsc(x); }).join(' | ') + '</div>' : '')
+        + '</td>'
+        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + Number(g.index) + ', this)">Reset GPU</button></td>'
+        + '</tr>';
+    }).join('');
+    table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
+  }).catch(e => {
+    status.textContent = 'Error loading NVIDIA GPU status: ' + e;
+    status.style.color = 'var(--crit-fg, #9f3a38)';
+    table.innerHTML = '';
+  });
+}
+loadNvidiaSelfHeal();
+</script>`
+}
+
+// renderTools returns the body of the Tools page: a System Install card
+// (install to RAM plus the inline disk installer), a Support Bundle card that
+// also hosts the USB black-box export widget, a Tool Check card, and cards
+// embedding the NVIDIA self-heal, network and services widgets.
+//
+// The inline scripts read /api/system/ram-status, start the RAM install via
+// POST /api/system/install-to-ram and list tool availability from
+// /api/tools/check. Each request reports failures in the page instead of
+// leaving a permanent "Checking..." placeholder or a disabled button.
+func renderTools() string {
+	return `<div class="card" style="margin-bottom:16px">
+  <div class="card-head">System Install</div>
+  <div class="card-body">
+    <div style="margin-bottom:20px">
+    <div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
+    <p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
+    <p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
+    <button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">&#9654; Copy to RAM</button>
+    </div>
+    <div style="border-top:1px solid var(--line);padding-top:20px">
+    <div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
+		renderInstallInline() + `
+    </div>
+  </div>
+</div>
+<script>
+fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
+  const boot = document.getElementById('boot-source-text');
+  const txt = document.getElementById('ram-status-text');
+  const btn = document.getElementById('ram-install-btn');
+  let source = d.device || d.source || 'unknown source';
+  let kind = d.kind || 'unknown';
+  let label = source;
+  if (kind === 'ram') label = 'RAM';
+  else if (kind === 'usb') label = 'USB (' + source + ')';
+  else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
+  else if (kind === 'disk') label = 'disk (' + source + ')';
+  boot.textContent = 'Current boot source: ' + label + '.';
+  txt.textContent = d.message || 'Checking...';
+  if (d.status === 'ok' || d.in_ram) {
+    txt.style.color = 'var(--ok, green)';
+  } else if (d.status === 'failed') {
+    txt.style.color = 'var(--err, #b91c1c)';
+  } else {
+    txt.style.color = 'var(--muted)';
+  }
+  if (d.can_start_task) {
+    btn.style.display = '';
+    btn.disabled = false;
+  } else {
+    btn.style.display = 'none';
+  }
+}).catch(e=>{
+  const txt = document.getElementById('ram-status-text');
+  document.getElementById('boot-source-text').textContent = 'Boot source unavailable.';
+  txt.textContent = 'Failed to load RAM status: ' + (e.message || e);
+  txt.style.color = 'var(--err, #b91c1c)';
+});
+function installToRAM() {
+  const btn = document.getElementById('ram-install-btn');
+  const txt = document.getElementById('ram-status-text');
+  btn.disabled = true;
+  fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json().then(d=>{
+    if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
+    if (!d.task_id) throw new Error('missing task id');
+    window.location.href = '/tasks#' + d.task_id;
+  })).catch(e=>{
+    // Let the user retry instead of leaving the button disabled after a failed start.
+    btn.disabled = false;
+    txt.textContent = 'Install to RAM failed: ' + (e.message || e);
+    txt.style.color = 'var(--err, #b91c1c)';
+  });
+}
+</script>
+
+<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
+<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
+` + renderSupportBundleInline() + `
+<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
+  <div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
+  ` + renderUSBExportInline() + `
+</div>
+</div></div>
+
+<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">&#8635; Check</button></div>
+<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
+
+<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
+		renderNvidiaSelfHealInline() + `</div></div>
+
+<div class="card"><div class="card-head">Network</div><div class="card-body">` +
+		renderNetworkInline() + `</div></div>
+
+<div class="card"><div class="card-head">Services</div><div class="card-body">` +
+		renderServicesInline() + `</div></div>
+
+
+<script>
+function toolsEsc(s) {
+  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
+}
+function checkTools() {
+  const host = document.getElementById('tools-table');
+  host.innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
+  fetch('/api/tools/check').then(r=>r.json()).then(tools => {
+    const list = Array.isArray(tools) ? tools : [];
+    const rows = list.map(t =>
+      '<tr><td>'+toolsEsc(t.Name)+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '&#10003; '+toolsEsc(t.Path) : '&#10007; missing')+'</span></td></tr>'
+    ).join('');
+    host.innerHTML =
+      '<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
+  }).catch(e => {
+    host.innerHTML = '<p style="color:var(--err, #b91c1c);font-size:13px">Tool check failed: '+toolsEsc(e)+'</p>';
+  });
+}
+checkTools();
+</script>`
+}
+
+// renderExportIndex builds a minimal standalone HTML page listing the entries
+// returned by listExportFiles(exportDir). Each entry becomes a link to
+// /export/file?path=<entry>, with the entry query-escaped in the URL and
+// HTML-escaped in the visible text. A placeholder item is emitted when there
+// are no entries. The error from listExportFiles is returned unchanged.
+func renderExportIndex(exportDir string) (string, error) {
+	files, err := listExportFiles(exportDir)
+	if err != nil {
+		return "", err
+	}
+
+	var page strings.Builder
+	page.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
+	page.WriteString(`<h1>Bee Export Files</h1><ul>`)
+	if len(files) == 0 {
+		page.WriteString(`<li>No export files found.</li>`)
+	}
+	for _, name := range files {
+		page.WriteString(`<li><a href="/export/file?path=`)
+		page.WriteString(url.QueryEscape(name))
+		page.WriteString(`">`)
+		page.WriteString(html.EscapeString(name))
+		page.WriteString(`</a></li>`)
+	}
+	page.WriteString(`</ul></body></html>`)
+	return page.String(), nil
+}
@@ -0,0 +1,314 @@
+package webui
+
+// renderInstallInline returns the install-to-disk widget (warning banner, disk
+// table, typed-device confirmation, progress terminal, styles and script)
+// with no wrapping card.
+//
+// The script lists disks from /api/install/disks, requires the user to type
+// the selected device name before enabling the start button, posts {device}
+// to /api/install/run, and then streams /api/tasks/<id>/stream through an
+// EventSource until a "done" event arrives (empty data means success).
+// Device, model, size and warning strings are HTML-escaped before being
+// written into innerHTML.
+func renderInstallInline() string {
+	return `
+    <div class="alert alert-warn" style="margin-bottom:16px">
+      <strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
+      disk and write the live system onto it. All existing data on the target disk will be lost.
+      This operation cannot be undone.
+    </div>
+    <div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
+    <div id="install-disk-section" style="display:none">
+      <div class="card" style="margin-bottom:0">
+        <table id="install-disk-table">
+          <thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
+          <tbody id="install-disk-tbody"></tbody>
+        </table>
+      </div>
+      <div style="margin-top:12px">
+        <button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
+      </div>
+    </div>
+    <div id="install-confirm-section" style="display:none;margin-top:20px">
+      <div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
+      <div class="form-row" style="max-width:360px">
+        <label>Type the device name to confirm (e.g. /dev/sda)</label>
+        <input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
+      </div>
+      <button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
+      <button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
+    </div>
+    <div id="install-progress-section" style="display:none;margin-top:20px">
+      <div class="card-head" style="margin-bottom:8px">Installation Progress</div>
+      <div id="install-terminal" class="terminal" style="max-height:500px"></div>
+      <div id="install-status" style="margin-top:12px;font-size:13px"></div>
+    </div>
+
+<style>
+#install-disk-tbody tr{cursor:pointer}
+#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
+#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
+</style>
+
+<script>
+var _installSelected = null;
+
+function installEsc(s) {
+  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
+}
+
+function installRefreshDisks() {
+  document.getElementById('install-loading').style.display = '';
+  document.getElementById('install-disk-section').style.display = 'none';
+  document.getElementById('install-confirm-section').style.display = 'none';
+  _installSelected = null;
+  fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
+    document.getElementById('install-loading').style.display = 'none';
+    var tbody = document.getElementById('install-disk-tbody');
+    tbody.innerHTML = '';
+    if (!disks || disks.length === 0) {
+      tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
+    } else {
+      disks.forEach(function(d) {
+        var warnings = (d.warnings || []);
+        var statusHtml;
+        if (warnings.length === 0) {
+          statusHtml = '<span class="badge badge-ok">OK</span>';
+        } else {
+          var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
+          statusHtml = warnings.map(function(w){
+            var cls = hasSmall ? 'badge-err' : 'badge-warn';
+            return '<span class="badge ' + cls + '" title="' + installEsc(w) + '">' +
+              installEsc(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
+          }).join(' ');
+        }
+        var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
+          ? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
+        var tr = document.createElement('tr');
+        tr.dataset.device = d.device;
+        tr.dataset.model = d.model || 'Unknown';
+        tr.dataset.size = d.size;
+        tr.dataset.warnings = JSON.stringify(warnings);
+        tr.innerHTML =
+          '<td><input type="radio" name="install-disk" value="' + installEsc(d.device) + '"></td>' +
+          '<td><code>' + installEsc(d.device) + '</code>' + mountedNote + '</td>' +
+          '<td>' + installEsc(d.model || '—') + '</td>' +
+          '<td>' + installEsc(d.size) + '</td>' +
+          '<td>' + statusHtml + '</td>';
+        tr.addEventListener('click', function(){ installSelectDisk(this); });
+        tbody.appendChild(tr);
+      });
+    }
+    document.getElementById('install-disk-section').style.display = '';
+  }).catch(function(e){
+    document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
+  });
+}
+
+function installSelectDisk(tr) {
+  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
+  tr.classList.add('selected');
+  var radio = tr.querySelector('input[type=radio]');
+  if (radio) radio.checked = true;
+  _installSelected = {
+    device: tr.dataset.device,
+    model: tr.dataset.model,
+    size: tr.dataset.size,
+    warnings: JSON.parse(tr.dataset.warnings || '[]')
+  };
+  var warnBox = document.getElementById('install-confirm-warn');
+  var warnLines = '<strong>⚠ DANGER:</strong> ' + installEsc(_installSelected.device) +
+    ' (' + installEsc(_installSelected.model) + ', ' + installEsc(_installSelected.size) + ')' +
+    ' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
+  if (_installSelected.warnings.length > 0) {
+    warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + installEsc(w); }).join('<br>');
+  }
+  warnBox.innerHTML = warnLines;
+  document.getElementById('install-confirm-input').value = '';
+  document.getElementById('install-start-btn').disabled = true;
+  document.getElementById('install-confirm-section').style.display = '';
+  document.getElementById('install-progress-section').style.display = 'none';
+}
+
+function installDeselect() {
+  _installSelected = null;
+  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
+  document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
+  document.getElementById('install-confirm-section').style.display = 'none';
+}
+
+function installCheckConfirm() {
+  var val = document.getElementById('install-confirm-input').value.trim();
+  var ok = _installSelected && val === _installSelected.device;
+  document.getElementById('install-start-btn').disabled = !ok;
+}
+
+function installStart() {
+  if (!_installSelected) return;
+  document.getElementById('install-confirm-section').style.display = 'none';
+  document.getElementById('install-disk-section').style.display = 'none';
+  document.getElementById('install-loading').style.display = 'none';
+  var prog = document.getElementById('install-progress-section');
+  var term = document.getElementById('install-terminal');
+  var status = document.getElementById('install-status');
+  prog.style.display = '';
+  term.textContent = '';
+  status.textContent = 'Starting installation…';
+  status.style.color = 'var(--muted)';
+
+  fetch('/api/install/run', {
+    method: 'POST',
+    headers: {'Content-Type': 'application/json'},
+    body: JSON.stringify({device: _installSelected.device})
+  }).then(function(r){
+    return r.json().then(function(j){
+      if (!r.ok) throw new Error(j.error || r.statusText);
+      return j;
+    });
+  }).then(function(j){
+    if (!j.task_id) throw new Error('missing task id');
+    installStreamLog(j.task_id);
+  }).catch(function(e){
+    status.textContent = 'Error: ' + e;
+    status.style.color = 'var(--crit-fg)';
+  });
+}
+
+function installStreamLog(taskId) {
+  var term = document.getElementById('install-terminal');
+  var status = document.getElementById('install-status');
+  var es = new EventSource('/api/tasks/' + encodeURIComponent(taskId) + '/stream');
+  es.onmessage = function(e) {
+    term.textContent += e.data + '\n';
+    term.scrollTop = term.scrollHeight;
+  };
+  es.addEventListener('done', function(e) {
+    es.close();
+    if (!e.data) {
+      status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
+      var rebootBtn = document.createElement('button');
+      rebootBtn.className = 'btn btn-primary btn-sm';
+      rebootBtn.style.marginLeft = '12px';
+      rebootBtn.textContent = 'Reboot now';
+      rebootBtn.onclick = function(){
+        fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
+          body: JSON.stringify({name:'', action:'reboot'})});
+      };
+      status.appendChild(rebootBtn);
+    } else {
+      status.textContent = '✗ Installation failed: ' + e.data;
+      status.style.color = 'var(--crit-fg)';
+    }
+  });
+  es.onerror = function() {
+    es.close();
+    status.textContent = '✗ Stream disconnected.';
+    status.style.color = 'var(--crit-fg)';
+  };
+}
+
+installRefreshDisks();
+</script>
+`
+}
+
+func renderInstall() string {
+	return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
+		renderInstallInline() +
+		`</div></div>`
+}
+
+// renderTasks returns the body of the task list page: a toolbar (Cancel All,
+// Abort Tasks And Kill Orphans), a table container and the inline script.
+//
+// The script polls /api/tasks every 2000 ms, paginates the result client-side
+// (50 rows per page) and offers cancel and priority actions that post to
+// /api/tasks/<id>/cancel and /api/tasks/<id>/priority. Task ids are carried
+// in data attributes and URL-encoded when used in request paths, so an id is
+// never spliced into inline JavaScript or an unescaped URL.
+func renderTasks() string {
+	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
+<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
+<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
+<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
+<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
+</div>
+<div class="card">
+<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
+</div>
+<script>
+var _taskRefreshTimer = null;
+var _tasksAll = [];
+var _taskPage = 1;
+var _taskPageSize = 50;
+
+function loadTasks() {
+  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
+    _tasksAll = Array.isArray(tasks) ? tasks : [];
+    if (_tasksAll.length === 0) {
+      _taskPage = 1;
+      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
+      return;
+    }
+    const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
+    if (_taskPage > totalPages) _taskPage = totalPages;
+    if (_taskPage < 1) _taskPage = 1;
+    const start = (_taskPage - 1) * _taskPageSize;
+    const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
+    const rows = pageTasks.map(t => {
+      const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
+      const idAttr = ' data-task-id="'+escHtml(String(t.id))+'"';
+      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
+      const statusLabel = {running:'&#9654; running',pending:'pending',done:'&#10003; done',failed:'&#10007; failed',cancelled:'cancelled'}[t.status]||escHtml(String(t.status));
+      let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
+      if (t.status === 'running' || t.status === 'pending') {
+        actions += ' <button class="btn btn-sm btn-danger"'+idAttr+' onclick="cancelTask(this.dataset.taskId)">Cancel</button>';
+      }
+      if (t.status === 'pending') {
+        actions += ' <button class="btn btn-sm btn-secondary"'+idAttr+' onclick="setPriority(this.dataset.taskId,1)" title="Increase priority">&#8679;</button>';
+        actions += ' <button class="btn btn-sm btn-secondary"'+idAttr+' onclick="setPriority(this.dataset.taskId,-1)" title="Decrease priority">&#8681;</button>';
+      }
+      return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
+        '<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
+        '<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
+        '<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
+        '<td>'+escHtml(String(t.priority))+'</td>' +
+        '<td>'+actions+'</td></tr>';
+    }).join('');
+    const showingFrom = start + 1;
+    const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
+    const pager =
+      '<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
+        '<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
+        '<div style="display:flex;align-items:center;gap:8px">' +
+          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
+          '<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
+          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
+        '</div>' +
+      '</div>';
+    document.getElementById('tasks-table').innerHTML =
+      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
+  }).catch(e => {
+    // Keep the last rendered table on a transient poll failure; report only when nothing is shown yet.
+    if (_tasksAll.length === 0) {
+      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--crit-fg);font-size:13px;padding:16px">Failed to load tasks: '+escHtml(String(e))+'</p>';
+    }
+  });
+}
+
+function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
+function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
+function formatDurSec(sec) {
+  sec = Math.max(0, Math.round(sec||0));
+  if (sec < 60) return sec+'s';
+  const m = Math.floor(sec/60), ss = sec%60;
+  return m+'m '+ss+'s';
+}
+function setTaskPage(page) {
+  const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
+  _taskPage = Math.min(totalPages, Math.max(1, page));
+  loadTasks();
+}
+
+function cancelTask(id) {
+  fetch('/api/tasks/'+encodeURIComponent(id)+'/cancel',{method:'POST'}).then(()=>loadTasks());
+}
+function cancelAll() {
+  fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
+}
+function killWorkers() {
+  if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
+  var toast = document.getElementById('kill-toast');
+  fetch('/api/tasks/kill-workers',{method:'POST'})
+    .then(r=>r.json())
+    .then(d=>{
+      loadTasks();
+      var parts = [];
+      if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
+      if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
+      toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
+    })
+    .catch(e=>{
+      toast.textContent = 'Request failed: ' + e;
+    })
+    .finally(()=>{
+      toast.style.display = '';
+      setTimeout(()=>{ toast.style.display='none'; }, 5000);
+    });
+}
+function setPriority(id, delta) {
+  fetch('/api/tasks/'+encodeURIComponent(id)+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
+    .then(()=>loadTasks());
+}
+
+loadTasks();
+_taskRefreshTimer = setInterval(loadTasks, 2000);
+</script>`
+}
@@ -0,0 +1,238 @@
+package webui
+
+// renderMetrics returns the body of the live metrics page: chart cards for
+// server load, CPU temperature, ambient sensors, power and fan RPM, plus a
+// GPU section, followed by the inline script that drives them.
+//
+// The fan card and the GPU section start hidden; the script shows them when
+// /api/metrics/latest reports a non-empty fans list or GPU rows. GPU names
+// come from /api/gpu/nvidia, fetched once and cached as a promise (a failed
+// fetch resolves to an empty list). The "One chart per GPU" toggle switches
+// between per-metric and per-GPU charts and is stored in sessionStorage under
+// "bee.metrics.gpuChartMode". Visible images marked data-chart-refresh="1" are
+// reloaded every 3000 ms through a probe Image with a ?t= cache-buster, and
+// the layout is re-synced every 5000 ms.
+//
+// NOTE(review): the intro text says "updated every 2 seconds" while the chart
+// refresh timer is 3000 ms; confirm which value is intended.
+func renderMetrics() string {
+	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Server — Load</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Temperature — CPU</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
+  </div>
+</div>
+
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Temperature — Ambient Sensors</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Server — Power</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
+  </div>
+</div>
+
+<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
+  <div class="card-head">Server — Fan RPM</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
+  </div>
+</div>
+
+<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
+  <div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
+    <div>
+      <div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
+      <div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
+    </div>
+    <label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
+      <input id="gpu-chart-toggle" type="checkbox">
+      <span>One chart per GPU</span>
+    </label>
+  </div>
+
+  <div id="gpu-metrics-by-metric">
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Compute Load</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Memory Load</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Core Clock</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Power</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Temperature</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
+      </div>
+    </div>
+  </div>
+
+  <div id="gpu-metrics-by-gpu" style="display:none"></div>
+</section>
+
+<script>
+let gpuChartKey = '';
+const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
+let metricsNvidiaGPUsPromise = null;
+
+function loadMetricsNvidiaGPUs() {
+  if (!metricsNvidiaGPUsPromise) {
+    metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
+      .then(function(r) {
+        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
+        return r.json();
+      })
+      .then(function(list) { return Array.isArray(list) ? list : []; })
+      .catch(function() { return []; });
+  }
+  return metricsNvidiaGPUsPromise;
+}
+
+function metricsGPUNameMap(list) {
+  const out = {};
+  (list || []).forEach(function(gpu) {
+    const idx = Number(gpu.index);
+    if (!Number.isFinite(idx) || !gpu.name) return;
+    out[idx] = gpu.name;
+  });
+  return out;
+}
+
+function metricsGPUDisplayLabel(idx, names) {
+  const name = names && names[idx];
+  return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
+}
+
+function loadGPUChartModePreference() {
+  try {
+    return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
+  } catch (_) {
+    return false;
+  }
+}
+
+function saveGPUChartModePreference(perGPU) {
+  try {
+    sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
+  } catch (_) {}
+}
+
+function refreshChartImage(el) {
+  if (!el || el.dataset.loading === '1') return;
+  if (el.offsetParent === null) return;
+  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
+  const nextSrc = baseSrc + '?t=' + Date.now();
+  const probe = new Image();
+  el.dataset.baseSrc = baseSrc;
+  el.dataset.loading = '1';
+  probe.onload = function() {
+    el.src = nextSrc;
+    el.dataset.loading = '0';
+  };
+  probe.onerror = function() {
+    el.dataset.loading = '0';
+  };
+  probe.src = nextSrc;
+}
+
+function refreshCharts() {
+  document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
+}
+
+function gpuIndices(rows) {
+  const seen = {};
+  const out = [];
+  (rows || []).forEach(function(row) {
+    const idx = Number(row.index);
+    if (!Number.isFinite(idx) || seen[idx]) return;
+    seen[idx] = true;
+    out.push(idx);
+  });
+  return out.sort(function(a, b) { return a - b; });
+}
+
+function renderGPUOverviewCards(indices, names) {
+  const host = document.getElementById('gpu-metrics-by-gpu');
+  if (!host) return;
+  host.innerHTML = indices.map(function(idx) {
+    const label = metricsGPUDisplayLabel(idx, names);
+    return '<div class="card" style="margin-bottom:16px">' +
+      '<div class="card-head">' + label + ' — Overview</div>' +
+      '<div class="card-body" style="padding:8px">' +
+      '<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
+      '</div></div>';
+  }).join('');
+}
+
+function applyGPUChartMode() {
+  const perMetric = document.getElementById('gpu-metrics-by-metric');
+  const perGPU = document.getElementById('gpu-metrics-by-gpu');
+  const toggle = document.getElementById('gpu-chart-toggle');
+  const gpuModePerGPU = !!(toggle && toggle.checked);
+  if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
+  if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
+}
+
+function syncMetricsLayout(d) {
+  const fanCard = document.getElementById('card-server-fans');
+  if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
+  const section = document.getElementById('gpu-metrics-section');
+  const summary = document.getElementById('gpu-metrics-summary');
+  const indices = gpuIndices(d.gpus);
+  loadMetricsNvidiaGPUs().then(function(gpus) {
+    const names = metricsGPUNameMap(gpus);
+    if (section) section.style.display = indices.length > 0 ? '' : 'none';
+    if (summary) {
+      summary.textContent = indices.length > 0
+        ? ('Detected GPUs: ' + indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); }).join(', '))
+        : 'No GPUs detected in live metrics.';
+    }
+    const nextKey = indices.join(',') + '|' + indices.map(function(idx) { return names[idx] || ''; }).join(',');
+    if (nextKey !== gpuChartKey) {
+      renderGPUOverviewCards(indices, names);
+      gpuChartKey = nextKey;
+    }
+    applyGPUChartMode();
+  });
+}
+
+function loadMetricsLayout() {
+  fetch('/api/metrics/latest').then(function(r) { return r.json(); }).then(syncMetricsLayout).catch(function() {});
+}
+
+const gpuChartToggle = document.getElementById('gpu-chart-toggle');
+if (gpuChartToggle) {
+  gpuChartToggle.checked = loadGPUChartModePreference();
+}
+applyGPUChartMode();
+
+if (gpuChartToggle) {
+  gpuChartToggle.addEventListener('change', function() {
+    saveGPUChartModePreference(!!gpuChartToggle.checked);
+    applyGPUChartMode();
+    refreshCharts();
+  });
+}
+
+loadMetricsLayout();
+setInterval(refreshCharts, 3000);
+setInterval(loadMetricsLayout, 5000);
+</script>`
+}
@@ -0,0 +1,213 @@
+package webui
+
+import "html"
+
+// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
+//
+// The returned fragment contains, in order:
+//   - a "net-pending" banner (hidden by default) with a countdown and
+//     Confirm/Rollback buttons;
+//   - an "iface-table" placeholder that the script fills with the interface list;
+//   - a DHCP form and a Static IPv4 form;
+//   - an inline script that drives all of the above.
+//
+// The script calls loadNetwork() once and then every 5000 ms. loadNetwork
+// fetches /api/network, renders the interfaces, their state and IPv4
+// addresses, prints the default route when present, and shows or hides the
+// pending banner based on the response's pending_change / rollback_in fields.
+// Toggling an interface, running DHCP and applying a static address POST JSON
+// to /api/network/toggle, /api/network/dhcp and /api/network/static; the
+// banner buttons POST to /api/network/confirm and /api/network/rollback.
+// The countdown starts from rollback_in when the server supplies it and from
+// NET_ROLLBACK_SECS (60) otherwise.
+//
+// NOTE(review): interface names and states from /api/network are concatenated
+// into innerHTML and into inline onclick handlers without escaping — confirm
+// the API can only return values that are safe in that context.
+func renderNetworkInline() string {
+	return `<div id="net-pending" style="display:none" class="alert alert-warn">
+<strong>&#9888; Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
+<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
+<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
+</div>
+<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div class="grid2" style="margin-top:16px">
+<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
+<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
+<button class="btn btn-primary" onclick="runDHCP()">&#9654; Run DHCP</button>
+<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
+</div>
+<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
+<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
+<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
+<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
+<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
+<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
+<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
+<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
+</div>
+</div>
+<script>
+var _netCountdownTimer = null;
+var _netRefreshTimer = null;
+const NET_ROLLBACK_SECS = 60;
+function loadNetwork() {
+  fetch('/api/network').then(r=>r.json()).then(d => {
+    const rows = (d.interfaces||[]).map(i =>
+      '<tr><td style="cursor:pointer" onclick="selectIface(\''+i.Name+'\')" title="Use this interface in the forms below"><span style="text-decoration:underline">'+i.Name+'</span></td>' +
+      '<td style="cursor:pointer" onclick="toggleIface(\''+i.Name+'\',\''+i.State+'\')" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td>' +
+      '<td>'+(i.IPv4||[]).join(', ')+'</td></tr>'
+    ).join('');
+    document.getElementById('iface-table').innerHTML =
+      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
+      (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
+    if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    else hideNetPending();
+  }).catch(function() {});
+}
+function selectIface(iface) {
+  document.getElementById('dhcp-iface').value = iface;
+  document.getElementById('st-iface').value = iface;
+}
+function toggleIface(iface, currentState) {
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
+    .then(r=>r.json()).then(d => {
+      if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
+      loadNetwork();
+      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    }).catch(function() {
+      setTimeout(loadNetwork, 1500);
+    });
+}
+function hideNetPending() {
+  const el = document.getElementById('net-pending');
+  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
+  _netCountdownTimer = null;
+  el.style.display = 'none';
+}
+function showNetPending(secs) {
+  if (!secs || secs < 1) { hideNetPending(); return; }
+  const el = document.getElementById('net-pending');
+  el.style.display = 'block';
+  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
+  let remaining = secs;
+  document.getElementById('net-countdown').textContent = remaining;
+  _netCountdownTimer = setInterval(function() {
+    remaining--;
+    document.getElementById('net-countdown').textContent = remaining;
+    if (remaining <= 0) { hideNetPending(); loadNetwork(); }
+  }, 1000);
+}
+function confirmNetChange() {
+  hideNetPending();
+  fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
+}
+function rollbackNetChange() {
+  hideNetPending();
+  fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
+}
+function runDHCP() {
+  const iface = document.getElementById('dhcp-iface').value.trim();
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
+    .then(r=>r.json()).then(d => {
+      document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
+      if (d.error) { hideNetPending(); return; }
+      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+      loadNetwork();
+    }).catch(function() {
+      setTimeout(loadNetwork, 1500);
+    });
+}
+function setStatic() {
+  const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
+    interface: document.getElementById('st-iface').value,
+    address: document.getElementById('st-addr').value,
+    prefix: document.getElementById('st-prefix').value,
+    gateway: document.getElementById('st-gw').value,
+    dns: dns,
+  })}).then(r=>r.json()).then(d => {
+    document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
+    if (d.error) { hideNetPending(); return; }
+    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    loadNetwork();
+  }).catch(function() {
+    setTimeout(loadNetwork, 1500);
+  });
+}
+loadNetwork();
+if (_netRefreshTimer) clearInterval(_netRefreshTimer);
+_netRefreshTimer = setInterval(loadNetwork, 5000);
+</script>`
+}
+
+func renderNetwork() string {
+	return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
+		renderNetworkInline() +
+		`</div></div>`
+}
+
+// renderServicesInline returns the services UI without a wrapping card.
+//
+// The fragment contains an explanatory note about bee-selfheal.timer, a
+// Refresh button, a "svc-table" placeholder, a hidden output panel and an
+// inline script:
+//   - loadServices fetches /api/services and renders one row per unit with a
+//     status badge (click to expand the unit body) and Start/Stop/Restart
+//     buttons. A non-array payload (for example JSON null) is rendered as an
+//     empty table, and a failed request replaces the "Loading..." placeholder
+//     with an error message instead of leaving it on screen forever.
+//   - toggleBody shows or hides the expanded unit body.
+//   - svcAction POSTs {name, action} to /api/services/action, shows the
+//     returned output in the terminal panel and reloads the table after 800 ms.
+//
+// The unit body is HTML-escaped (&, <, >) before it is inserted into the page.
+func renderServicesInline() string {
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
+<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
+<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div id="svc-out" style="display:none;margin-top:12px">
+  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
+    <span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
+    <span id="svc-out-status" style="font-size:12px"></span>
+  </div>
+  <div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
+</div>
+<script>
+function loadServices() {
+  fetch('/api/services').then(r=>r.json()).then(svcs => {
+    const rows = (Array.isArray(svcs) ? svcs : []).map(s => {
+      const st = s.state||'unknown';
+      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
+      const id = 'svc-body-'+s.name.replace(/[^a-z0-9]/g,'-');
+      const body = (s.body||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');
+      return '<tr>' +
+        '<td style="white-space:nowrap">'+s.name+'</td>' +
+        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+st+' ▾</span>' +
+        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
+        '</td>' +
+        '<td style="white-space:nowrap">' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start"   onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop"    onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
+        '</td></tr>';
+    }).join('');
+    document.getElementById('svc-table').innerHTML =
+      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
+  }).catch(function() {
+    document.getElementById('svc-table').innerHTML =
+      '<p style="color:var(--crit-fg);font-size:13px">Failed to load services.</p>';
+  });
+}
+function toggleBody(id) {
+  const el = document.getElementById(id);
+  if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
+}
+function svcAction(btn, name, action) {
+  var label = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = '...';
+  var out = document.getElementById('svc-out');
+  var term = document.getElementById('svc-terminal');
+  var statusEl = document.getElementById('svc-out-status');
+  var labelEl = document.getElementById('svc-out-label');
+  out.style.display = 'block';
+  labelEl.textContent = action + ' ' + name;
+  term.textContent = 'Running...';
+  statusEl.textContent = '';
+  statusEl.style.color = '';
+  fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
+    .then(r=>r.json()).then(d => {
+      term.textContent = d.output || d.error || '(no output)';
+      term.scrollTop = term.scrollHeight;
+      if (d.status === 'ok') {
+        statusEl.textContent = '✓ done';
+        statusEl.style.color = 'var(--ok-fg, #2c662d)';
+      } else {
+        statusEl.textContent = '✗ failed';
+        statusEl.style.color = 'var(--crit-fg, #9f3a38)';
+      }
+      btn.textContent = label;
+      btn.disabled = false;
+      setTimeout(loadServices, 800);
+    }).catch(e => {
+      term.textContent = 'Request failed: ' + e;
+      statusEl.textContent = '✗ error';
+      statusEl.style.color = 'var(--crit-fg, #9f3a38)';
+      btn.textContent = label;
+      btn.disabled = false;
+    });
+}
+loadServices();
+</script>`
+}
+
+func renderServices() string {
+	return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
+		renderServicesInline() +
+		`</div></div>`
+}
@@ -0,0 +1,663 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"sort"
+	"strings"
+
+	"bee/audit/internal/platform"
+	"bee/audit/internal/schema"
+)
+
+// validateInventory carries the per-subsystem data the Validate page needs.
+type validateInventory struct {
+	// One string per hardware class. renderValidate passes each of them as
+	// the first argument of renderValidateCardBody for the matching card and
+	// also prints NVIDIA in the GPU selection card.
+	// NOTE(review): presumably human-readable inventory summaries — confirm
+	// against loadValidateInventory.
+	CPU, Memory, Storage, NVIDIA, AMD string
+
+	// GPU counts per vendor. renderValidate uses NvidiaGPUCount for the
+	// duration estimates and the "(N GPU)" note.
+	NvidiaGPUCount, AMDGPUCount int
+}
+
+// validateFmtDur formats an estimated duration given in seconds as a short
+// approximate label. Durations below 120 s are printed in seconds ("~90 s");
+// anything longer is printed in whole minutes computed as (secs+29)/60, so
+// an exact half minute rounds down ("~2 min" for 150 s, "~3 min" for 151 s).
+func validateFmtDur(secs int) string {
+	switch {
+	case secs < 120:
+		return fmt.Sprintf("~%d s", secs)
+	default:
+		return fmt.Sprintf("~%d min", (secs+29)/60)
+	}
+}
+
+// validateTotalValidateSec returns the estimated duration, in seconds, of a
+// full Validate-mode run. n is the NVIDIA GPU count; it only acts as a
+// "has GPUs" switch (n <= 0 means none) and the total does not scale with it.
+//
+// NOTE(review): the NVIDIA interconnect and bandwidth estimates are counted
+// even when n is 0 — confirm that this is intended for hosts without NVIDIA GPUs.
+func validateTotalValidateSec(n int) int {
+	stages := []int{
+		platform.SATEstimatedCPUValidateSec,
+		platform.SATEstimatedMemoryValidateSec,
+		platform.SATEstimatedNvidiaInterconnectSec,
+		platform.SATEstimatedNvidiaBandwidthSec,
+	}
+	if n > 0 {
+		stages = append(stages, platform.SATEstimatedNvidiaGPUValidateSec)
+	}
+	sum := 0
+	for _, sec := range stages {
+		sum += sec
+	}
+	return sum
+}
+
+// validateTotalStressSec returns the estimated duration, in seconds, of a
+// full Stress-mode run. n is the NVIDIA GPU count; it only acts as a
+// "has GPUs" switch (n <= 0 means none) and the total does not scale with it.
+//
+// NOTE(review): the NVIDIA pulse-test, interconnect and bandwidth estimates
+// are counted even when n is 0 — confirm that this is intended for hosts
+// without NVIDIA GPUs.
+func validateTotalStressSec(n int) int {
+	stages := []int{
+		platform.SATEstimatedCPUStressSec,
+		platform.SATEstimatedMemoryStressSec,
+		platform.SATEstimatedNvidiaPulseTestSec,
+		platform.SATEstimatedNvidiaInterconnectSec,
+		platform.SATEstimatedNvidiaBandwidthSec,
+	}
+	if n > 0 {
+		stages = append(stages,
+			platform.SATEstimatedNvidiaGPUStressSec,
+			platform.SATEstimatedNvidiaTargetedStressSec,
+			platform.SATEstimatedNvidiaTargetedPowerSec,
+		)
+	}
+	sum := 0
+	for _, sec := range stages {
+		sum += sec
+	}
+	return sum
+}
+
+func renderValidate(opts HandlerOptions) string {
+	inv := loadValidateInventory(opts)
+	n := inv.NvidiaGPUCount
+	validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
+	stressTotalStr := validateFmtDur(validateTotalStressSec(n))
+	gpuNote := ""
+	if n > 0 {
+		gpuNote = fmt.Sprintf(" (%d GPU)", n)
+	}
+	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
+<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+	<div class="card" style="margin-bottom:16px">
+	  <div class="card-head">Validate Profile</div>
+	  <div class="card-body validate-profile-body">
+	    <div class="validate-profile-col">
+	      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
+	    </div>
+	    <div class="validate-profile-col validate-profile-action">
+	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
+	      <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
+	      <div style="margin-top:12px">
+	        <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
+	      </div>
+	    </div>
+	  </div>
+	</div>
+
+<div class="grid3">
+` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
+		inv.CPU,
+		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
+		`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
+		validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
+	)) +
+		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
+			inv.Memory,
+			`Runs a RAM validation pass and records memory state around the test.`,
+			`<code>free</code>, <code>memtester</code>`,
+			validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
+		)) +
+		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
+			inv.Storage,
+			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
+			`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
+			`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
+		)) +
+		`</div>
+<div style="height:1px;background:var(--border);margin:16px 0"></div>
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">NVIDIA GPU Selection</div>
+  <div class="card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
+    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
+      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
+    </div>
+    <div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+    </div>
+    <p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
+  </div>
+</div>
+
+<div class="grid3">
+` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
+		inv.NVIDIA,
+		`Runs NVIDIA diagnostics and board inventory checks.`,
+		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
+		fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
+			validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
+			validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
+	)) +
+		`<div id="sat-card-nvidia-targeted-stress">` +
+		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
+			`<code>dcgmi diag targeted_stress</code>`,
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-targeted-power">` +
+		renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
+			`<code>dcgmi diag targeted_power</code>`,
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-pulse">` +
+		renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
+			`<code>dcgmi diag pulse_test</code>`,
+			`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-interconnect">` +
+		renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
+			`<code>all_reduce_perf</code> (NCCL tests)`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-bandwidth">` +
+		renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
+			`<code>nvbandwidth</code>`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
+		)) +
+		`</div>` +
+		`</div>
+<div class="grid3" style="margin-top:16px">
+` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
+		inv.AMD,
+		`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
+		`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
+		`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
+	)) +
+		`</div>
+<div id="sat-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Test Output <span id="sat-title"></span></div>
+  <div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
+</div>
+<style>
+.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
+.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
+.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
+.validate-card-body { padding:0; }
+.validate-card-section { padding:12px 16px 0; }
+.validate-card-section:last-child { padding-bottom:16px; }
+.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
+</style>
+<script>
+let satES = null;
+function satStressMode() {
+  return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
+}
+function satModeChanged() {
+  const stress = satStressMode();
+  [
+    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
+    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
+    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
+  ].forEach(function(item) {
+    const card = document.getElementById(item.card);
+    if (card) {
+      card.style.opacity = stress ? '1' : '0.5';
+      const hint = document.getElementById(item.hint);
+      if (hint) hint.style.display = stress ? 'none' : '';
+    }
+  });
+}
+function satLabels() {
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+}
+let satNvidiaGPUsPromise = null;
+function loadSatNvidiaGPUs() {
+  if (!satNvidiaGPUsPromise) {
+    satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
+      .then(r => {
+        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
+        return r.json();
+      })
+      .then(list => Array.isArray(list) ? list : []);
+  }
+  return satNvidiaGPUsPromise;
+}
+function satSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
+    .filter(function(el) { return el.checked && !el.disabled; })
+    .map(function(el) { return parseInt(el.value, 10); })
+    .filter(function(v) { return !Number.isNaN(v); })
+    .sort(function(a, b) { return a - b; });
+}
+function satUpdateGPUSelectionNote() {
+  const note = document.getElementById('sat-gpu-selection-note');
+  if (!note) return;
+  const selected = satSelectedGPUIndices();
+  if (!selected.length) {
+    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
+    return;
+  }
+  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
+}
+function satRenderGPUList(gpus) {
+  const root = document.getElementById('sat-gpu-list');
+  if (!root) return;
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    satUpdateGPUSelectionNote();
+    return;
+  }
+  root.innerHTML = gpus.map(function(gpu) {
+    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
+    return '<label class="sat-gpu-row">'
+      + '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
+      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+      + '</label>';
+  }).join('');
+  satUpdateGPUSelectionNote();
+}
+function satSelectAllGPUs() {
+  document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
+  satUpdateGPUSelectionNote();
+}
+function satSelectNoGPUs() {
+  document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
+  satUpdateGPUSelectionNote();
+}
+function satLoadGPUs() {
+  loadSatNvidiaGPUs().then(function(gpus) {
+    satRenderGPUList(gpus);
+  }).catch(function(err) {
+    const root = document.getElementById('sat-gpu-list');
+    if (root) {
+      root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
+    }
+    satUpdateGPUSelectionNote();
+  });
+}
+function satGPUDisplayName(gpu) {
+  const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
+  const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
+  return 'GPU ' + idx + ' — ' + name;
+}
+function satRequestBody(target, overrides) {
+  const body = {};
+  const labels = satLabels();
+  body.display_name = labels[target] || ('Validate ' + target);
+  body.stress_mode = satStressMode();
+  if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
+  if (overrides) {
+    Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
+  }
+  return body;
+}
+function enqueueSATTarget(target, overrides) {
+  return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
+    .then(r => r.json());
+}
+function streamSATTask(taskId, title, resetTerminal) {
+  if (satES) { satES.close(); satES = null; }
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— ' + title;
+  const term = document.getElementById('sat-terminal');
+  if (resetTerminal) {
+    term.textContent = '';
+  }
+  term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
+  return new Promise(function(resolve) {
+    satES = new EventSource('/api/tasks/' + taskId + '/stream');
+    satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    satES.addEventListener('done', function(e) {
+      satES.close();
+      satES = null;
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      term.scrollTop = term.scrollHeight;
+      resolve({ok: !e.data, error: e.data || ''});
+    });
+    satES.onerror = function() {
+      if (satES) {
+        satES.close();
+        satES = null;
+      }
+      term.textContent += '\nERROR: stream disconnected.\n';
+      term.scrollTop = term.scrollHeight;
+      resolve({ok: false, error: 'stream disconnected'});
+    };
+  });
+}
+function selectedAMDValidateTargets() {
+  const targets = [];
+  const gpu = document.getElementById('sat-amd-target');
+  const mem = document.getElementById('sat-amd-mem-target');
+  const bw = document.getElementById('sat-amd-bandwidth-target');
+  if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
+  if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
+  if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
+  return targets;
+}
+function runSAT(target) {
+  return runSATWithOverrides(target, null);
+}
+function runSATWithOverrides(target, overrides) {
+  const title = (overrides && overrides.display_name) || target;
+  const term = document.getElementById('sat-terminal');
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— ' + title;
+  term.textContent = 'Enqueuing ' + title + ' test...\n';
+  return enqueueSATTarget(target, overrides)
+    .then(d => streamSATTask(d.task_id, title, false));
+}
+const nvidiaPerGPUTargets = [];
+const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+function satAllGPUIndicesForMulti() {
+  return Promise.resolve(satSelectedGPUIndices());
+}
+function expandSATTarget(target) {
+  if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
+    return satAllGPUIndicesForMulti().then(function(indices) {
+      if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
+      return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
+    });
+  }
+  if (nvidiaPerGPUTargets.indexOf(target) < 0) {
+    return Promise.resolve([{target: target}]);
+  }
+  const selected = satSelectedGPUIndices();
+  if (!selected.length) {
+    return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
+  }
+  return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
+    target: target,
+    overrides: {
+      gpu_indices: [Number(gpu.index)],
+      display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
+    },
+    label: satGPUDisplayName(gpu),
+  })));
+}
+function runNvidiaFabricValidate(target) {
+  satAllGPUIndicesForMulti().then(function(indices) {
+    if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
+    runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
+  });
+}
+function runNvidiaValidateSet(target) {
+  const selected = satSelectedGPUIndices();
+  if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
+  return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
+}
+function runAMDValidateSet() {
+  const targets = selectedAMDValidateTargets();
+  if (!targets.length) return;
+  if (targets.length === 1) return runSAT(targets[0]);
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— amd';
+  const term = document.getElementById('sat-terminal');
+  term.textContent = 'Running AMD validate set one by one...\n';
+  const labels = satLabels();
+  const runNext = (idx) => {
+    if (idx >= targets.length) return Promise.resolve();
+    const target = targets[idx];
+    term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
+    return enqueueSATTarget(target)
+      .then(d => {
+        return streamSATTask(d.task_id, labels[target], false);
+      }).then(function() {
+        return runNext(idx + 1);
+      });
+  };
+  return runNext(0);
+}
+function runAllSAT() {
+  const cycles = 1;
+  const status = document.getElementById('sat-all-status');
+  status.textContent = 'Enqueuing...';
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
+  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
+  const activeTargets = baseTargets.filter(target => {
+    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
+    const btn = document.getElementById('sat-btn-' + target);
+    return !(btn && btn.disabled);
+  });
+  Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
+    const expanded = [];
+    for (let cycle = 0; cycle < cycles; cycle++) {
+      groups.forEach(group => group.forEach(item => expanded.push(item)));
+    }
+    const total = expanded.length;
+    let enqueued = 0;
+    if (!total) {
+      status.textContent = 'No tasks selected.';
+      return;
+    }
+    const runNext = (idx) => {
+      if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
+      const item = expanded[idx];
+      status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
+      return enqueueSATTarget(item.target, item.overrides)
+        .then(() => {
+          enqueued++;
+          return runNext(idx + 1);
+        });
+    };
+    return runNext(0);
+  }).catch(err => {
+    status.textContent = 'Error: ' + err.message;
+  });
+}
+</script>
+<script>
+fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
+    if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
+    if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
+    if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
+});
+satLoadGPUs();
+function disableSATAMDOptions(reason) {
+    ['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
+        const cb = document.getElementById(id);
+        if (!cb) return;
+        cb.disabled = true;
+        cb.checked = false;
+        cb.title = reason;
+    });
+}
+function disableSATCard(id, reason) {
+    const btn = document.getElementById('sat-btn-' + id);
+    if (!btn) return;
+    btn.disabled = true;
+    btn.title = reason;
+    btn.style.opacity = '0.4';
+    const card = btn.closest('.card');
+    if (card) {
+        let note = card.querySelector('.sat-unavail');
+        if (!note) {
+            note = document.createElement('p');
+            note.className = 'sat-unavail';
+            note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
+            const body = card.querySelector('.card-body');
+            if (body) body.insertBefore(note, body.firstChild);
+        }
+        note.textContent = reason;
+    }
+}
+</script>`
+}
+
+// loadValidateInventory builds the per-category device summaries for the
+// validate page from the audit snapshot at opts.AuditPath.
+//
+// When the snapshot cannot be loaded or decoded, every summary is the
+// placeholder "Audit snapshot not loaded." and both GPU counts stay zero.
+// Entries whose Present flag is explicitly false are left out of the counts.
+func loadValidateInventory(opts HandlerOptions) validateInventory {
+	placeholder := "Audit snapshot not loaded."
+	inv := validateInventory{
+		CPU:     placeholder,
+		Memory:  placeholder,
+		Storage: placeholder,
+		NVIDIA:  placeholder,
+		AMD:     placeholder,
+	}
+	raw, err := loadSnapshot(opts.AuditPath)
+	if err != nil {
+		return inv
+	}
+	var snapshot schema.HardwareIngestRequest
+	if err := json.Unmarshal(raw, &snapshot); err != nil {
+		return inv
+	}
+
+	var cpuTotal, memTotal, storageTotal, nvidiaTotal, amdTotal int
+	cpuModels := map[string]int{}
+	memModels := map[string]int{}
+	storageModels := map[string]int{}
+	nvidiaModels := map[string]int{}
+	amdModels := map[string]int{}
+
+	for _, cpu := range snapshot.Hardware.CPUs {
+		if cpu.Present == nil || *cpu.Present {
+			cpuTotal++
+			addValidateModel(cpuModels, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
+		}
+	}
+
+	for _, dimm := range snapshot.Hardware.Memory {
+		if dimm.Present == nil || *dimm.Present {
+			memTotal++
+			addValidateModel(memModels, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
+		}
+	}
+
+	for _, disk := range snapshot.Hardware.Storage {
+		if disk.Present == nil || *disk.Present {
+			storageTotal++
+			addValidateModel(storageModels, validateFirstNonEmpty(validateTrimPtr(disk.Model), validateTrimPtr(disk.Manufacturer), "unknown"))
+		}
+	}
+
+	for _, pci := range snapshot.Hardware.PCIeDevices {
+		if pci.Present != nil && !*pci.Present {
+			continue
+		}
+		// The two vendor checks are independent: a device is counted for every
+		// vendor whose check it passes.
+		if validateIsVendorGPU(pci, "nvidia") {
+			nvidiaTotal++
+			addValidateModel(nvidiaModels, validateFirstNonEmpty(validateTrimPtr(pci.Model), validateTrimPtr(pci.Manufacturer), "unknown"))
+		}
+		if validateIsVendorGPU(pci, "amd") {
+			amdTotal++
+			addValidateModel(amdModels, validateFirstNonEmpty(validateTrimPtr(pci.Model), validateTrimPtr(pci.Manufacturer), "unknown"))
+		}
+	}
+
+	inv.CPU = formatValidateDeviceSummary(cpuTotal, cpuModels, "CPU")
+	inv.Memory = formatValidateDeviceSummary(memTotal, memModels, "module")
+	inv.Storage = formatValidateDeviceSummary(storageTotal, storageModels, "device")
+	inv.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaModels, "GPU")
+	inv.AMD = formatValidateDeviceSummary(amdTotal, amdModels, "GPU")
+	inv.NvidiaGPUCount = nvidiaTotal
+	inv.AMDGPUCount = amdTotal
+	return inv
+}
+
+func renderValidateCardBody(devices, description, commands, settings string) string {
+	return `<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">` + devices + `</div></div>` +
+		`<div class="validate-card-section"><div style="font-size:13px">` + description + `</div></div>` +
+		`<div class="validate-card-section"><div style="font-size:13px">` + commands + `</div></div>` +
+		`<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">` + settings + `</div></div>`
+}
+
+// formatValidateDeviceSummary renders a one-line, HTML-escaped summary of a
+// device category.
+//
+// total is the number of devices, models maps a model name to its count and
+// unit is the singular noun ("CPU", "GPU", ...). Model names are listed in
+// sorted order. A single model yields "<n> x <model> <unit(s)>"; otherwise
+// the result is "<total> <unit(s)>: <n> x <model>, ...". A zero total yields
+// "0 <unit>s detected.".
+func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
+	plural := unit + "s"
+	if total == 0 {
+		return "0 " + plural + " detected."
+	}
+	names := make([]string, 0, len(models))
+	for name := range models {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+	entries := make([]string, len(names))
+	for i, name := range names {
+		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
+	}
+	noun := plural
+	if total == 1 {
+		noun = unit
+	}
+	if len(entries) != 1 {
+		return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
+	}
+	return entries[0] + " " + noun
+}
+
+// addValidateModel increments the counter for name in counts. The name is
+// trimmed first; a blank name is counted under "unknown".
+func addValidateModel(counts map[string]int, name string) {
+	key := strings.TrimSpace(name)
+	if len(key) == 0 {
+		key = "unknown"
+	}
+	counts[key] = counts[key] + 1
+}
+
+// validateTrimPtr returns the whitespace-trimmed string value points to, or
+// "" for a nil pointer.
+func validateTrimPtr(value *string) string {
+	if value != nil {
+		return strings.TrimSpace(*value)
+	}
+	return ""
+}
+
+// validateFirstNonEmpty returns the first argument that is non-blank after
+// trimming, in its trimmed form, or "" when every argument is blank.
+func validateFirstNonEmpty(values ...string) string {
+	for i := range values {
+		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
+			return trimmed
+		}
+	}
+	return ""
+}
+
+// validateIsVendorGPU reports whether a PCIe device from the audit snapshot
+// counts as a GPU of the given vendor ("nvidia" or "amd"). Any other vendor
+// yields false, as does any device whose model or manufacturer mentions
+// "aspeed".
+//
+// NVIDIA devices are matched on model or manufacturer text alone. AMD devices
+// must also have a GPU-like device class (processing accelerator, display
+// controller or video controller).
+//
+// NOTE(review): the nvidia branch does not check the device class, so any
+// NVIDIA-branded PCIe function is counted. Confirm that is intended.
+func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
+	model := strings.ToLower(validateTrimPtr(dev.Model))
+	manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
+	class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
+	if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
+		return false
+	}
+	switch vendor {
+	case "nvidia":
+		return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
+	case "amd":
+		isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
+		// "ati" must match as a whole word. As a plain substring it also occurs
+		// in "corporation", which made GPU-class devices from "NVIDIA
+		// Corporation" or "Intel Corporation" count as AMD GPUs.
+		isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || validateHasWord(manufacturer, "ati")
+		isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
+		return isGPUClass && (isAMDVendor || isAMDModel)
+	default:
+		return false
+	}
+}
+
+// validateHasWord reports whether word occurs in s as a complete token, where
+// tokens are maximal runs of ASCII lowercase letters and digits. Both
+// arguments are expected to be lowercase already.
+func validateHasWord(s, word string) bool {
+	tokens := strings.FieldsFunc(s, func(r rune) bool {
+		isLower := r >= 'a' && r <= 'z'
+		isDigit := r >= '0' && r <= '9'
+		return !isLower && !isDigit
+	})
+	for _, token := range tokens {
+		if token == word {
+			return true
+		}
+	}
+	return false
+}
+
+// renderSATCard returns the HTML for one validate card: a header holding the
+// label, a Run button with id "sat-btn-<id>" whose onclick is runAction, any
+// extra headerActions markup, and then the card body. All arguments are
+// inserted verbatim, so they must already be safe HTML/JS.
+func renderSATCard(id, label, runAction, headerActions, body string) string {
+	headButtons := fmt.Sprintf(`<button id="sat-btn-%s" class="btn btn-primary btn-sm" onclick="%s">Run</button>`, id, runAction)
+	if len(strings.TrimSpace(headerActions)) > 0 {
+		headButtons += headerActions
+	}
+	return `<div class="card"><div class="card-head card-head-actions"><span>` + label +
+		`</span><div class="card-head-buttons">` + headButtons +
+		`</div></div><div class="card-body validate-card-body">` + body + `</div></div>`
+}
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
 // At metricsCollectInterval = 5 s this covers 30 minutes of live history.
 const metricsChartWindow = 360

+// metricsDownsampleAge is the age after which old metrics rows are downsampled
+// to 1 sample per minute. Data fresher than this is kept at full resolution.
+const metricsDownsampleAge = 2 * time.Hour
+
+// metricsRetainWindow is the total retention period for metrics rows.
+// Rows older than this are deleted entirely by the background compactor.
+const metricsRetainWindow = 48 * time.Hour
+
 var metricsCollectInterval = 5 * time.Second

 // pendingNetChange tracks a network state change awaiting confirmation.
@@ -263,6 +271,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
 	mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
 	mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
+	mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
+	mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
+	mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)

 	// Tasks
 	mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -290,8 +301,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// Export
 	mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
 	mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
-	mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
-	mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
+	mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
+	mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
+	mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)

 	// Tools
 	mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
@@ -334,13 +346,24 @@ func (h *handler) startMetricsCollector() {
 	goRecoverLoop("metrics collector", 2*time.Second, func() {
 		ticker := time.NewTicker(metricsCollectInterval)
 		defer ticker.Stop()
-		for range ticker.C {
-			sample := platform.SampleLiveMetrics()
-			if h.metricsDB != nil {
-				_ = h.metricsDB.Write(sample)
+		pruneTicker := time.NewTicker(time.Hour)
+		defer pruneTicker.Stop()
+		for {
+			select {
+			case <-ticker.C:
+				sample := platform.SampleLiveMetrics()
+				if h.metricsDB != nil {
+					_ = h.metricsDB.Write(sample)
+				}
+				h.feedRings(sample)
+				h.setLatestMetric(sample)
+			case <-pruneTicker.C:
+				if h.metricsDB != nil {
+					now := time.Now().UTC()
+					_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
+					_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
+				}
 			}
-			h.feedRings(sample)
-			h.setLatestMetric(sample)
 		}
 	})
 }
@@ -574,12 +597,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	}
 	timeline := metricsTimelineSegments(samples, time.Now())
 	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
-		buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
+		var overviewOk bool
+		var buf []byte
+		buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
 		if err != nil {
 			http.Error(w, err.Error(), http.StatusInternalServerError)
 			return
 		}
-		if !ok {
+		if !overviewOk {
 			http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 			return
 		}
@@ -588,23 +613,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 		_, _ = w.Write(buf)
 		return
 	}
-	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}

-	buf, err := renderMetricChartSVG(
-		title,
-		labels,
-		sampleTimes(samples),
-		datasets,
-		names,
-		yMin,
-		yMax,
-		chartCanvasHeightForPath(path, len(names)),
-		timeline,
-	)
+	var buf []byte
+	if stacked {
+		buf, err = renderStackedMetricChartSVG(
+			title,
+			labels,
+			sampleTimes(samples),
+			datasets,
+			names,
+			yMax,
+			chartCanvasHeightForPath(path, len(names)),
+			timeline,
+		)
+	} else {
+		buf, err = renderMetricChartSVG(
+			title,
+			labels,
+			sampleTimes(samples),
+			datasets,
+			names,
+			yMin,
+			yMax,
+			chartCanvasHeightForPath(path, len(names)),
+			timeline,
+		)
+	}
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -614,12 +653,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	_, _ = w.Write(buf)
 }

-func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
-	var datasets [][]float64
-	var names []string
-	var title string
-	var yMin, yMax *float64
-	labels := sampleTimeLabels(samples)
+func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
+	labels = sampleTimeLabels(samples)

 	switch {
 	case path == "server-load":
@@ -656,12 +691,19 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	case path == "server-power":
 		title = "System Power"
 		power := make([]float64, len(samples))
+		label := "Power W"
 		for i, s := range samples {
 			power[i] = s.PowerW
+			if strings.TrimSpace(s.PowerSource) != "" {
+				label = fmt.Sprintf("Power W · %s", s.PowerSource)
+				if strings.TrimSpace(s.PowerMode) != "" {
+					label += fmt.Sprintf(" (%s)", s.PowerMode)
+				}
+			}
 		}
 		power = normalizePowerSeries(power)
 		datasets = [][]float64{power}
-		names = []string{"Power W"}
+		names = []string{label}
 		yMin = floatPtr(0)
 		yMax = autoMax120(power)

@@ -706,7 +748,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	case strings.HasPrefix(path, "gpu/"):
 		idx, sub, ok := parseGPUChartPath(path)
 		if !ok {
-			return nil, nil, nil, "", nil, nil, false
+			return nil, nil, nil, "", nil, nil, false, false
 		}
 		switch sub {
 		case "load":
@@ -714,7 +756,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
 			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
 			if util == nil && mem == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
 			names = []string{"Load %", "Mem %"}
@@ -724,7 +766,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Temperature"
 			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
 			if temp == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{temp}
 			names = []string{"Temp °C"}
@@ -734,7 +776,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Core Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
 			if clock == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Core Clock MHz"}
@@ -743,7 +785,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Memory Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
 			if clock == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Memory Clock MHz"}
@@ -752,7 +794,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Power"
 			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
 			if power == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{power}
 			names = []string{"Power W"}
@@ -760,10 +802,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 		}

 	default:
-		return nil, nil, nil, "", nil, nil, false
+		return nil, nil, nil, "", nil, nil, false, false
 	}

-	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
+	return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
 }

 func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
@@ -929,6 +971,37 @@ func normalizePowerSeries(ds []float64) []float64 {
 	return out
 }

+// psuSlotsFromSamples returns every distinct PSU slot number found in samples,
+// in ascending order. The result is empty (never nil) when there are no PSU
+// readings.
+func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
+	known := make(map[int]struct{})
+	slots := make([]int, 0)
+	for i := range samples {
+		for _, psu := range samples[i].PSUs {
+			if _, dup := known[psu.Slot]; dup {
+				continue
+			}
+			known[psu.Slot] = struct{}{}
+			slots = append(slots, psu.Slot)
+		}
+	}
+	sort.Ints(slots)
+	return slots
+}
+
+// psuStackedTotal returns the point-by-point sum of all PSU datasets (for
+// scale calculation). It returns nil when there are no datasets.
+//
+// The result is as long as the longest dataset; shorter datasets contribute
+// nothing past their own end. Sizing the result from the first dataset alone
+// would panic with an index-out-of-range whenever a later dataset is longer.
+func psuStackedTotal(datasets [][]float64) []float64 {
+	if len(datasets) == 0 {
+		return nil
+	}
+	n := 0
+	for _, ds := range datasets {
+		if len(ds) > n {
+			n = len(ds)
+		}
+	}
+	total := make([]float64, n)
+	for _, ds := range datasets {
+		for i, v := range ds {
+			total[i] += v
+		}
+	}
+	return total
+}
+
 func normalizeFanSeries(ds []float64) []float64 {
 	if len(ds) == 0 {
 		return nil
@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 		},
 	}

-	datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
 		},
 	}

-	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
 		},
 	}

-	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
+	datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
 	if !ok {
 		t.Fatal("gpu-all-clock returned ok=false")
 	}
@@ -420,6 +420,49 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
 	}
 }

+// TestChartDataFromSamplesServerPowerUsesResolvedSystemPower checks that the
+// "server-power" chart is built from the sample-level PowerW value rather than
+// from the individual PSU readings: it must be titled "System Power", must not
+// be stacked, and must hold exactly one series whose name carries the power
+// source and mode ("Power W · sdr_psu_input (autotuned)").
+func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
+	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	// Two samples one minute apart, each with two PSU readings plus a resolved
+	// total and its source/mode labels.
+	samples := []platform.LiveMetricSample{
+		{
+			Timestamp: start,
+			PSUs: []platform.PSUReading{
+				{Slot: 1, PowerW: 120},
+				{Slot: 2, PowerW: 130},
+			},
+			PowerW:      250,
+			PowerSource: "sdr_psu_input",
+			PowerMode:   "autotuned",
+		},
+		{
+			Timestamp: start.Add(time.Minute),
+			PSUs: []platform.PSUReading{
+				{Slot: 1, PowerW: 140},
+				{Slot: 2, PowerW: 135},
+			},
+			PowerW:      275,
+			PowerSource: "sdr_psu_input",
+			PowerMode:   "autotuned",
+		},
+	}
+
+	datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", samples)
+	if !ok {
+		t.Fatal("expected server-power chart data")
+	}
+	if title != "System Power" {
+		t.Fatalf("title=%q", title)
+	}
+	if stacked {
+		t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
+	}
+	if len(datasets) != 1 || len(names) != 1 {
+		t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
+	}
+	if names[0] != "Power W · sdr_psu_input (autotuned)" {
+		t.Fatalf("names=%v", names)
+	}
+}
+
 func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
 	want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -628,11 +671,11 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
 	if !strings.Contains(body, `id="boot-source-text"`) {
 		t.Fatalf("tools page missing boot source field: %s", body)
 	}
-	if !strings.Contains(body, `Export to USB`) {
-		t.Fatalf("tools page missing export to usb section: %s", body)
+	if !strings.Contains(body, `USB Black-Box`) {
+		t.Fatalf("tools page missing usb black-box section: %s", body)
 	}
-	if !strings.Contains(body, `Support Bundle</button>`) {
-		t.Fatalf("tools page missing support bundle usb button: %s", body)
+	if !strings.Contains(body, `/api/blackbox/status`) {
+		t.Fatalf("tools page missing black-box status api usage: %s", body)
 	}
 }

@@ -650,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 		`/api/gpu/nvidia`,
 		`/api/bee-bench/nvidia/perf/run`,
 		`/api/bee-bench/nvidia/power/run`,
+		`/api/bee-bench/nvidia/autotune/run`,
+		`/api/bee-bench/nvidia/autotune/status`,
 		`benchmark-run-nccl`,
 		`Run Performance Benchmark`,
 		`Run Power / Thermal Fit`,
+		`Autotune`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("benchmark page missing %q: %s", needle, body)
@@ -744,6 +790,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	}
 }

+// TestValidatePageRendersNvidiaFabricCardsInValidateMode requests /validate
+// from a handler built with default options and checks that the response is
+// 200 OK and that the page contains the NCCL interconnect and NVBandwidth
+// card titles together with their descriptive text.
+func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	// Each needle must appear verbatim somewhere in the rendered page.
+	for _, needle := range []string{
+		`NVIDIA Interconnect (NCCL)`,
+		`Validate and Stress:`,
+		`NVIDIA Bandwidth (NVBandwidth)`,
+		`nvbandwidth runs all built-in tests without a time limit`,
+	} {
+		if !strings.Contains(body, needle) {
+			t.Fatalf("validate page missing %q: %s", needle, body)
+		}
+	}
+}
+
 func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
@@ -7,14 +7,43 @@ import (
 	"time"
 )

+const (
+	recoverLoopMaxDelay   = 60 * time.Second
+	recoverLoopResetAfter = 30 * time.Second
+)
+
+// goRecoverLoop starts fn in a goroutine, restarting after panics.
+// restartDelay is the initial delay; successive panics double it up to
+// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
+// successfully for recoverLoopResetAfter without panicking.
 func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
 	go func() {
+		delay := restartDelay
+		consecutive := 0
 		for {
-			if !runRecoverable(name, fn) {
+			start := time.Now()
+			panicked := runRecoverable(name, fn)
+			if !panicked {
 				return
 			}
-			if restartDelay > 0 {
-				time.Sleep(restartDelay)
+			consecutive++
+			if time.Since(start) >= recoverLoopResetAfter {
+				delay = restartDelay
+				consecutive = 1
+			}
+			slog.Warn("goroutine restarting after panic",
+				"component", name,
+				"consecutive_panics", consecutive,
+				"next_delay", delay,
+			)
+			if delay > 0 {
+				time.Sleep(delay)
+			}
+			if delay < recoverLoopMaxDelay {
+				delay *= 2
+				if delay > recoverLoopMaxDelay {
+					delay = recoverLoopMaxDelay
+				}
 			}
 		}
 	}()
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
 		}
 		return gpuDisplayLabel(idx) + " Overview", buf, true
 	}
-	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		return "", nil, false
 	}
-	buf, err := renderMetricChartSVG(
-		title,
-		labels,
-		sampleTimes(samples),
-		datasets,
-		names,
-		yMin,
-		yMax,
-		chartCanvasHeightForPath(path, len(names)),
-		timeline,
-	)
+	var buf []byte
+	var err error
+	if stacked {
+		buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
+	} else {
+		buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
+	}
 	if err != nil {
 		return "", nil, false
 	}
@@ -0,0 +1,505 @@
+package webui
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"strings"
+	"syscall"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+	"bee/audit/internal/runtimeenv"
+)
+
+// taskRunnerState is the JSON document that writeTaskRunnerState stores as
+// runner-state.json in a task's artifacts directory and that
+// readTaskRunnerState loads back.
+type taskRunnerState struct {
+	// PID is presumably the process ID of the runner executing the task, in a
+	// form usable with processAlive. TODO confirm against the writer.
+	PID       int       `json:"pid"`
+	// Status is a free-form status string; the set of values is defined by the
+	// code that writes the state.
+	Status    string    `json:"status"`
+	// Error is omitted from the JSON when empty.
+	Error     string    `json:"error,omitempty"`
+	// UpdatedAt is serialised under "updated_at".
+	UpdatedAt time.Time `json:"updated_at"`
+}
+
+// taskRunnerStatePath returns the location of runner-state.json inside the
+// task's artifacts directory, or "" when t is nil or has no artifacts
+// directory set.
+func taskRunnerStatePath(t *Task) string {
+	if t == nil {
+		return ""
+	}
+	if dir := t.ArtifactsDir; strings.TrimSpace(dir) != "" {
+		return filepath.Join(dir, "runner-state.json")
+	}
+	return ""
+}
+
+// writeTaskRunnerState persists state as indented JSON at the task's
+// runner-state path, creating the parent directory if needed. It is a no-op
+// returning nil when the task has no state path.
+//
+// The data is written to "<path>.tmp" first and then renamed over the final
+// path, so readers do not see a partially written file. If the write or the
+// rename fails, the temporary file is removed on a best-effort basis so that
+// failed attempts do not leave stale ".tmp" files in the artifacts directory.
+func writeTaskRunnerState(t *Task, state taskRunnerState) error {
+	path := taskRunnerStatePath(t)
+	if path == "" {
+		return nil
+	}
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return err
+	}
+	data, err := json.MarshalIndent(state, "", "  ")
+	if err != nil {
+		return err
+	}
+	tmp := path + ".tmp"
+	if err := os.WriteFile(tmp, data, 0644); err != nil {
+		_ = os.Remove(tmp) // best effort; the write error is what matters
+		return err
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		_ = os.Remove(tmp) // best effort; the rename error is what matters
+		return err
+	}
+	return nil
+}
+
+// readTaskRunnerState loads the runner state stored for t. The boolean is
+// false, and the state zero-valued, when the task has no state path or the
+// file is missing, unreadable, empty or not valid JSON.
+func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
+	var none taskRunnerState
+	statePath := taskRunnerStatePath(t)
+	if statePath == "" {
+		return none, false
+	}
+	raw, readErr := os.ReadFile(statePath)
+	if readErr != nil || len(raw) == 0 {
+		return none, false
+	}
+	var parsed taskRunnerState
+	if json.Unmarshal(raw, &parsed) != nil {
+		return none, false
+	}
+	return parsed, true
+}
+
+// processAlive reports whether a process with the given PID exists, probing
+// it with signal 0. A permission error (EPERM) still counts as alive, since
+// the process exists but may not be signalled by us. Non-positive PIDs are
+// never alive.
+func processAlive(pid int) bool {
+	if pid <= 0 {
+		return false
+	}
+	switch syscall.Kill(pid, 0) {
+	case nil, syscall.EPERM:
+		return true
+	default:
+		return false
+	}
+}
+
+// finalizeTaskForResult stamps t.DoneAt with the current time and sets the
+// final status. Cancellation wins (TaskCancelled, message "aborted"); a
+// non-blank errMsg marks the task TaskFailed with that message; otherwise the
+// task is TaskDone with an empty message.
+func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
+	finishedAt := time.Now()
+	t.DoneAt = &finishedAt
+	if cancelled {
+		t.Status = TaskCancelled
+		t.ErrMsg = "aborted"
+		return
+	}
+	if strings.TrimSpace(errMsg) != "" {
+		t.Status = TaskFailed
+		t.ErrMsg = errMsg
+		return
+	}
+	t.Status = TaskDone
+	t.ErrMsg = ""
+}
+
+func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
+	if opts == nil {
+		j.append("ERROR: handler options not configured")
+		j.finish("handler options not configured")
+		return
+	}
+	a := opts.App
+
+	recovered := len(j.lines) > 0
+	j.append(fmt.Sprintf("Starting %s...", t.Name))
+	if recovered {
+		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
+	}
+
+	var (
+		archive string
+		err     error
+	)
+
+	switch t.Target {
+	case "nvidia":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		diagLevel := 2
+		if t.params.StressMode {
+			diagLevel = 3
+		}
+		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
+			result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
+			if e != nil {
+				err = e
+			} else {
+				archive = result.Body
+			}
+		} else {
+			archive, err = a.RunNvidiaAcceptancePack("", j.append)
+		}
+	case "nvidia-targeted-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if dur <= 0 {
+			dur = 300
+		}
+		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-bench-perf":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
+			Profile:           t.params.BenchmarkProfile,
+			SizeMB:            t.params.SizeMB,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			RunNCCL:           t.params.RunNCCL,
+			ParallelGPUs:      t.params.ParallelGPUs,
+			RampStep:          t.params.RampStep,
+			RampTotal:         t.params.RampTotal,
+			RampRunID:         t.params.RampRunID,
+		}, j.append)
+	case "nvidia-bench-power":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
+			Profile:           t.params.BenchmarkProfile,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			RampStep:          t.params.RampStep,
+			RampTotal:         t.params.RampTotal,
+			RampRunID:         t.params.RampRunID,
+		}, j.append)
+	case "nvidia-bench-autotune":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
+			Profile: t.params.BenchmarkProfile,
+			SizeMB:  t.params.SizeMB,
+		}, t.params.BenchmarkKind, j.append)
+	case "nvidia-compute":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
+		if planErr != nil {
+			err = planErr
+			break
+		}
+		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
+			dur = rampPlan.DurationSec
+		}
+		if rampPlan.StaggerSeconds > 0 {
+			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
+		}
+		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
+	case "nvidia-targeted-power":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-pulse":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-bandwidth":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
+	case "nvidia-interconnect":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
+	case "nvidia-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
+		if planErr != nil {
+			err = planErr
+			break
+		}
+		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
+			dur = rampPlan.DurationSec
+		}
+		if rampPlan.StaggerSeconds > 0 {
+			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
+		}
+		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
+			DurationSec:       dur,
+			Loader:            t.params.Loader,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			StaggerSeconds:    rampPlan.StaggerSeconds,
+		}, j.append)
+	case "memory":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
+		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
+		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
+	case "storage":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
+	case "cpu":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		if dur <= 0 {
+			if t.params.StressMode {
+				dur = 1800
+			} else {
+				dur = 60
+			}
+		}
+		j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
+		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
+	case "amd":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
+	case "amd-mem":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
+	case "amd-bandwidth":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
+	case "amd-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
+	case "memory-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
+	case "sat-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
+	case "platform-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
+		runOpts.Components = t.params.PlatformComponents
+		archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
+	case "audit":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		result, e := a.RunAuditNow(opts.RuntimeMode)
+		if e != nil {
+			err = e
+		} else {
+			for _, line := range splitLines(result.Body) {
+				j.append(line)
+			}
+		}
+	case "support-bundle":
+		j.append("Building support bundle...")
+		archive, err = buildSupportBundle(opts.ExportDir)
+	case "install":
+		if strings.TrimSpace(t.params.Device) == "" {
+			err = fmt.Errorf("device is required")
+			break
+		}
+		installLogPath := platform.InstallLogPath(t.params.Device)
+		j.append("Install log: " + installLogPath)
+		err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
+	case "install-to-ram":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		err = a.RunInstallToRAM(ctx, j.append)
+	default:
+		j.append("ERROR: unknown target: " + t.Target)
+		j.finish("unknown target")
+		return
+	}
+
+	if archive != "" {
+		archivePath := app.ExtractArchivePath(archive)
+		if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
+			err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
+		}
+		if opts.App != nil && opts.App.StatusDB != nil {
+			app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
+		}
+	}
+
+	if err != nil {
+		if ctx.Err() != nil {
+			j.append("Aborted.")
+			j.finish("aborted")
+		} else {
+			j.append("ERROR: " + err.Error())
+			j.finish(err.Error())
+		}
+		return
+	}
+	if archive != "" {
+		j.append("Archive: " + archive)
+	}
+	j.finish("")
+}
+
+// loadPersistedTask reads the JSON task list stored at statePath and rebuilds
+// the Task whose ID equals taskID.
+//
+// Read and decode errors are returned unchanged. When no persisted entry
+// carries the requested ID a "task <id> not found" error is returned. The
+// rebuilt task is passed through ensureTaskReportPaths before it is returned.
+func loadPersistedTask(statePath, taskID string) (*Task, error) {
+	raw, readErr := os.ReadFile(statePath)
+	if readErr != nil {
+		return nil, readErr
+	}
+
+	var entries []persistedTask
+	if decodeErr := json.Unmarshal(raw, &entries); decodeErr != nil {
+		return nil, decodeErr
+	}
+
+	for i := range entries {
+		entry := entries[i]
+		if entry.ID == taskID {
+			task := &Task{
+				ID:             entry.ID,
+				Name:           entry.Name,
+				Target:         entry.Target,
+				Priority:       entry.Priority,
+				Status:         entry.Status,
+				CreatedAt:      entry.CreatedAt,
+				StartedAt:      entry.StartedAt,
+				DoneAt:         entry.DoneAt,
+				ErrMsg:         entry.ErrMsg,
+				LogPath:        entry.LogPath,
+				ArtifactsDir:   entry.ArtifactsDir,
+				ReportJSONPath: entry.ReportJSONPath,
+				ReportHTMLPath: entry.ReportHTMLPath,
+				params:         entry.Params,
+			}
+			ensureTaskReportPaths(task)
+			return task, nil
+		}
+	}
+	return nil, fmt.Errorf("task %s not found", taskID)
+}
+
+// RunPersistedTask executes, inside the current process, the task with ID
+// taskID that was persisted in <exportDir>/tasks-state.json, and returns a
+// process exit code. It is the body of the "bee bee-worker" subcommand.
+//
+// Exit codes:
+//   - 2 when exportDir or taskID is blank;
+//   - 1 when the task cannot be loaded, the initial runner state cannot be
+//     written, or the task finished with a non-empty error message;
+//   - 0 otherwise.
+//
+// SIGINT and SIGTERM cancel the context handed to the task, which is how the
+// parent process aborts a running worker. stdout is currently unused;
+// diagnostics are written to stderr.
+func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
+	if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
+		// Name the subcommand the user actually invoked ("bee-worker").
+		fmt.Fprintln(stderr, "bee bee-worker: --export-dir and --task-id are required")
+		return 2
+	}
+
+	// A failed runtime detection is not fatal: the task still runs with
+	// whatever mode the (possibly zero) runtimeInfo carries.
+	runtimeInfo, err := runtimeenv.Detect("auto")
+	if err != nil {
+		slog.Warn("resolve runtime for bee-worker", "err", err)
+	}
+	opts := &HandlerOptions{
+		ExportDir:   exportDir,
+		App:         app.New(platform.New()),
+		RuntimeMode: runtimeInfo.Mode,
+	}
+	statePath := filepath.Join(exportDir, "tasks-state.json")
+	task, err := loadPersistedTask(statePath, taskID)
+	if err != nil {
+		fmt.Fprintln(stderr, err.Error())
+		return 1
+	}
+	if task.StartedAt == nil || task.StartedAt.IsZero() {
+		now := time.Now()
+		task.StartedAt = &now
+	}
+	if task.Status == "" {
+		task.Status = TaskRunning
+	}
+	// Publish our PID and "running" status before doing any work.
+	if err := writeTaskRunnerState(task, taskRunnerState{
+		PID:       os.Getpid(),
+		Status:    TaskRunning,
+		UpdatedAt: time.Now().UTC(),
+	}); err != nil {
+		fmt.Fprintln(stderr, err.Error())
+		return 1
+	}
+
+	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
+	defer cancel()
+
+	j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
+	executeTaskWithOptions(opts, task, j, ctx)
+	// A cancelled context means the run was aborted by a signal.
+	finalizeTaskForResult(task, j.err, ctx.Err() != nil)
+	if err := writeTaskReportArtifacts(task); err != nil {
+		appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
+	}
+	j.closeLog()
+	// Record the final status; a failure here is reported but does not
+	// change the exit code.
+	if err := writeTaskRunnerState(task, taskRunnerState{
+		PID:       os.Getpid(),
+		Status:    task.Status,
+		Error:     task.ErrMsg,
+		UpdatedAt: time.Now().UTC(),
+	}); err != nil {
+		fmt.Fprintln(stderr, err.Error())
+	}
+	if task.ErrMsg != "" {
+		return 1
+	}
+	return 0
+}
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"io"
 	"log/slog"
 	"net/http"
 	"os"
@@ -13,6 +14,7 @@ import (
 	"sort"
 	"strings"
 	"sync"
+	"syscall"
 	"time"

 	"bee/audit/internal/app"
@@ -34,6 +36,7 @@ var taskNames = map[string]string{
 	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
 	"nvidia-bench-perf":      "NVIDIA Bee Bench Perf",
 	"nvidia-bench-power":     "NVIDIA Bee Bench Power",
+	"nvidia-bench-autotune":  "NVIDIA Bee Bench Power Source Autotune",
 	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
 	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
 	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -109,8 +112,9 @@ type Task struct {
 	ReportHTMLPath string     `json:"report_html_path,omitempty"`

 	// runtime fields (not serialised)
-	job    *jobState
-	params taskParams
+	job       *jobState
+	runnerPID int
+	params    taskParams
 }

 // taskParams holds optional parameters parsed from the run request.
@@ -125,6 +129,7 @@ type taskParams struct {
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
+	BenchmarkKind      string   `json:"benchmark_kind,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	RampStep           int      `json:"ramp_step,omitempty"`
@@ -162,6 +167,32 @@ type nvidiaRampSpec struct {
 	TotalDurationSec int
 }

+// resolveMemoryValidatePreset picks the memory validate size (in MB) and the
+// number of passes for a run.
+//
+// A recognised burn profile name, matched case-insensitively and ignoring
+// surrounding whitespace, takes precedence. For any other profile the stress
+// flag selects between the two fallback presets.
+func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
+	name := strings.ToLower(profile)
+	name = strings.TrimSpace(name)
+
+	if name == "overnight" {
+		return 1024, 2
+	}
+	if name == "acceptance" {
+		return 1024, 1
+	}
+	if name == "smoke" {
+		return 256, 1
+	}
+
+	// Unknown or empty profile: fall back on the stress flag.
+	if !stress {
+		return 256, 1
+	}
+	return 512, 1
+}
+
+// taskMayLeaveOrphanWorkers reports whether target is one of the task targets
+// for which callers kill leftover test workers after the task is cancelled or
+// found interrupted. The target is matched case-insensitively with
+// surrounding whitespace ignored.
+func taskMayLeaveOrphanWorkers(target string) bool {
+	key := strings.ToLower(target)
+	key = strings.TrimSpace(key)
+
+	orphanProne := false
+	switch key {
+	case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse":
+		orphanProne = true
+	case "nvidia-bandwidth", "nvidia-stress", "nvidia-compute", "nvidia-bench-perf":
+		orphanProne = true
+	case "memory", "memory-stress", "cpu", "sat-stress", "platform-stress":
+		orphanProne = true
+	}
+	return orphanProne
+}
+
 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
@@ -300,6 +331,13 @@ var (
 	installCommand     = func(ctx context.Context, device string, logPath string) *exec.Cmd {
 		return exec.CommandContext(ctx, "bee-install", device, logPath)
 	}
+	// externalTaskRunnerCommand builds (without starting) the command that
+	// re-executes the current binary as "bee-worker" for the given export
+	// directory and task ID. It fails only when the path of the running
+	// executable cannot be determined.
+	externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
+		self, lookupErr := os.Executable()
+		if lookupErr != nil {
+			return nil, lookupErr
+		}
+		workerArgs := []string{"bee-worker", "--export-dir", exportDir, "--task-id", taskID}
+		return exec.Command(self, workerArgs...), nil
+	}
 )

 // enqueue adds a task to the queue and notifies the worker.
@@ -337,6 +375,11 @@ func (q *taskQueue) prune() {

 // nextPending returns the highest-priority pending task (nil if none).
 func (q *taskQueue) nextPending() *Task {
+	for _, t := range q.tasks {
+		if t.Status == TaskRunning {
+			return nil
+		}
+	}
 	var best *Task
 	for _, t := range q.tasks {
 		if t.Status != TaskPending {
@@ -456,6 +499,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 	if !q.started {
 		q.loadLocked()
 		q.started = true
+		q.resumeRunningTasksLocked()
 		goRecoverLoop("task worker", 2*time.Second, q.worker)
 	}
 	hasPending := q.nextPending() != nil
@@ -489,15 +533,12 @@ func (q *taskQueue) worker() {
 				t.StartedAt = &now
 				t.DoneAt = nil
 				t.ErrMsg = ""
-				j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
+				j := newTaskJobState(t.LogPath)
 				t.job = j
 				q.persistLocked()
 				q.mu.Unlock()

-				taskCtx, taskCancel := context.WithCancel(context.Background())
-				j.cancel = taskCancel
-				q.executeTask(t, j, taskCtx)
-				taskCancel()
+				q.runTaskExternal(t, j)

 				q.mu.Lock()
 				q.prune()
@@ -509,6 +550,207 @@ func (q *taskQueue) worker() {
 	}
 }

+// resumeRunningTasksLocked re-attaches control and monitoring to every task
+// in the queue whose status is still TaskRunning: it makes sure the task has
+// a job state, installs the cancel hook and starts the recovery monitor.
+// The "Locked" suffix indicates the caller is expected to hold q.mu.
+func (q *taskQueue) resumeRunningTasksLocked() {
+	for _, task := range q.tasks {
+		if task.Status == TaskRunning {
+			if task.job == nil {
+				task.job = newTaskJobState(task.LogPath)
+			}
+			job := task.job
+			q.attachExternalTaskControlsLocked(task, job)
+			q.startRecoveredTaskMonitorLocked(task, job)
+		}
+	}
+}
+
+// attachExternalTaskControlsLocked installs j.cancel so that cancelling the
+// job sends SIGTERM to the external runner process of t. It does nothing when
+// either argument is nil.
+//
+// The PID is resolved when cancel is invoked, not when the hook is installed:
+// t.runnerPID is preferred, and the persisted runner state is consulted only
+// when that field is not positive.
+func (q *taskQueue) attachExternalTaskControlsLocked(t *Task, j *jobState) {
+	if t == nil || j == nil {
+		return
+	}
+	j.cancel = func() {
+		target := t.runnerPID
+		if target <= 0 {
+			state, found := readTaskRunnerState(t)
+			if found {
+				target = state.PID
+			}
+		}
+		if target <= 0 {
+			return
+		}
+		// Best effort: the signal error is deliberately ignored.
+		_ = syscall.Kill(target, syscall.SIGTERM)
+	}
+}
+
+// startRecoveredTaskMonitorLocked starts a background monitor for a task
+// whose external runner process is already running (t.runnerPID > 0). It does
+// nothing when t or j is nil or no runner PID is known.
+//
+// The monitor tails the task log into j, polls every 500ms until the runner
+// process is no longer alive, stops the log follower and waits for it to
+// drain, and only then finalises the task via finishExternalTask.
+//
+// The "Locked" suffix indicates the caller is expected to hold q.mu.
+func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
+	if t == nil || j == nil || t.runnerPID <= 0 {
+		return
+	}
+	// Snapshot the PID now instead of re-reading t.runnerPID from the
+	// goroutine: other code writes that field under q.mu, and the monitor
+	// runs without the lock.
+	pid := t.runnerPID
+	goRecoverOnce("task runner monitor", func() {
+		stopTail := make(chan struct{})
+		doneTail := make(chan struct{})
+		go q.followTaskLog(t, j, stopTail, doneTail)
+		for processAlive(pid) {
+			time.Sleep(500 * time.Millisecond)
+		}
+		// Drain the log follower before the job is marked finished.
+		close(stopTail)
+		<-doneTail
+		q.finishExternalTask(t, j, nil)
+	})
+}
+
+// runTaskExternal runs task t in a separate worker process and blocks until
+// that process exits.
+//
+// While the worker runs, the task log is tailed into j. Once the worker has
+// been started its PID is stored on the task, the cancel hook is installed
+// and the queue is persisted. When the worker exits, or cannot be created or
+// started, the log follower is stopped and drained first and only then is
+// the task finalised via finishExternalTask, so no tailed line is appended
+// after the job has been finished and its log closed. This matches the order
+// used by startRecoveredTaskMonitorLocked.
+func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
+	stopTail := make(chan struct{})
+	doneTail := make(chan struct{})
+	go q.followTaskLog(t, j, stopTail, doneTail)
+
+	// stopFollower asks the log follower to do its final flush and waits for
+	// it. It is idempotent so the deferred call (which covers panics) is a
+	// no-op after an explicit call. Only this goroutine touches followerStopped.
+	followerStopped := false
+	stopFollower := func() {
+		if followerStopped {
+			return
+		}
+		followerStopped = true
+		close(stopTail)
+		<-doneTail
+	}
+	defer stopFollower()
+
+	cmd, err := externalTaskRunnerCommand(q.opts.ExportDir, t.ID)
+	if err != nil {
+		j.appendFromLog("ERROR: " + err.Error())
+		stopFollower()
+		q.finishExternalTask(t, j, err)
+		return
+	}
+	if err := cmd.Start(); err != nil {
+		j.appendFromLog("ERROR: " + err.Error())
+		stopFollower()
+		q.finishExternalTask(t, j, err)
+		return
+	}
+
+	q.mu.Lock()
+	t.runnerPID = cmd.Process.Pid
+	q.attachExternalTaskControlsLocked(t, j)
+	q.persistLocked()
+	q.mu.Unlock()
+
+	waitErr := cmd.Wait()
+	// The worker has exited, so its log output is complete. Draining the
+	// follower here replaces the former fixed 200ms sleep, which was shorter
+	// than the follower's 250ms poll interval and so guaranteed nothing.
+	stopFollower()
+	q.finishExternalTask(t, j, waitErr)
+}
+
+// followTaskLog tails the log file of t and forwards every newly written,
+// non-empty line to j.appendFromLog until stop is closed. done is closed when
+// the follower returns.
+//
+// Only content written after the follower starts is forwarded: the initial
+// offset is the file size at start-up, or zero if the file does not exist
+// yet. The file is polled every 250ms. "\r\n" line endings are normalised to
+// "\n", including a pair that is split across two reads. An unterminated
+// trailing line is held back until it is completed, or emitted as-is on stop.
+//
+// The follower returns immediately when t is nil or has a blank LogPath.
+func (q *taskQueue) followTaskLog(t *Task, j *jobState, stop <-chan struct{}, done chan<- struct{}) {
+	defer close(done)
+	path := ""
+	if t != nil {
+		path = t.LogPath
+	}
+	if strings.TrimSpace(path) == "" {
+		return
+	}
+	offset := int64(0)
+	if info, err := os.Stat(path); err == nil {
+		offset = info.Size()
+	}
+	var partial string
+	ticker := time.NewTicker(250 * time.Millisecond)
+	defer ticker.Stop()
+
+	// flush forwards the complete lines written since the last call and
+	// reports whether any bytes were read.
+	flush := func() bool {
+		// The read error is deliberately not fatal: a missing or unreadable
+		// log is simply retried on the next tick. Bytes returned alongside
+		// an error are still processed, because the returned offset already
+		// points past them and they would otherwise be lost.
+		data, newOffset, _ := readTaskLogDelta(path, offset)
+		offset = newOffset
+		if len(data) == 0 {
+			return false
+		}
+		// Normalise after joining with the carried-over fragment so a
+		// "\r\n" split across two reads is handled too.
+		text := strings.ReplaceAll(partial+string(data), "\r\n", "\n")
+		lines := strings.Split(text, "\n")
+		partial = lines[len(lines)-1]
+		for _, line := range lines[:len(lines)-1] {
+			if line == "" {
+				continue
+			}
+			j.appendFromLog(line)
+		}
+		return true
+	}
+
+	// Each read is capped by readTaskLogDelta, so the final drain may need
+	// several reads. The bound keeps shutdown finite even if something is
+	// still appending to the file.
+	const maxDrainReads = 64
+
+	for {
+		select {
+		case <-ticker.C:
+			flush()
+		case <-stop:
+			for i := 0; i < maxDrainReads && flush(); i++ {
+			}
+			if strings.TrimSpace(partial) != "" {
+				j.appendFromLog(partial)
+			}
+			return
+		}
+	}
+}
+
+func readTaskLogDelta(path string, offset int64) ([]byte, int64, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, offset, err
+	}
+	defer f.Close()
+	info, err := f.Stat()
+	if err != nil {
+		return nil, offset, err
+	}
+	if info.Size() < offset {
+		offset = 0
+	}
+	if _, err := f.Seek(offset, io.SeekStart); err != nil {
+		return nil, offset, err
+	}
+	data, err := io.ReadAll(io.LimitReader(f, 1<<20))
+	return data, offset + int64(len(data)), err
+}
+
+func (q *taskQueue) finishExternalTask(t *Task, j *jobState, waitErr error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if t.Status == TaskDone || t.Status == TaskFailed || t.Status == TaskCancelled {
+		if j != nil && !j.isDone() {
+			j.finish(t.ErrMsg)
+			j.closeLog()
+		}
+		select {
+		case q.trigger <- struct{}{}:
+		default:
+		}
+		return
+	}
+
+	state, ok := readTaskRunnerState(t)
+	switch {
+	case ok && state.Status != TaskRunning:
+		t.Status = state.Status
+		t.ErrMsg = state.Error
+		now := state.UpdatedAt
+		if now.IsZero() {
+			now = time.Now()
+		}
+		t.DoneAt = &now
+	case waitErr != nil:
+		now := time.Now()
+		t.Status = TaskFailed
+		t.ErrMsg = waitErr.Error()
+		t.DoneAt = &now
+	default:
+		now := time.Now()
+		t.Status = TaskFailed
+		t.ErrMsg = "task runner exited without final state"
+		t.DoneAt = &now
+	}
+	t.runnerPID = 0
+	q.finalizeTaskArtifactPathsLocked(t)
+	q.persistLocked()
+
+	if j != nil && !j.isDone() {
+		j.finish(t.ErrMsg)
+		j.closeLog()
+	}
+	if t.ErrMsg != "" {
+		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
+	} else {
+		taskSerialEvent(t, "finished with status="+t.Status)
+	}
+	select {
+	case q.trigger <- struct{}{}:
+	default:
+	}
+}
+
 func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
 	startedKmsgWatch := false
 	defer q.finalizeTaskRun(t, j)
@@ -559,6 +801,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
 	if err := writeTaskReportArtifacts(t); err != nil {
 		appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
 	}
+	j.closeLog()
 	if t.ErrMsg != "" {
 		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
 		return
@@ -587,8 +830,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	}
 	a := q.opts.App

+	recovered := len(j.lines) > 0
 	j.append(fmt.Sprintf("Starting %s...", t.Name))
-	if len(j.lines) > 0 {
+	if recovered {
 		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
 	}

@@ -658,6 +902,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			RampTotal:         t.params.RampTotal,
 			RampRunID:         t.params.RampRunID,
 		}, j.append)
+	case "nvidia-bench-autotune":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
+			Profile: t.params.BenchmarkProfile,
+			SizeMB:  t.params.SizeMB,
+		}, t.params.BenchmarkKind, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -710,15 +963,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		dur := t.params.Duration
-		if t.params.BurnProfile != "" && dur <= 0 {
-			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
-		}
-		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
-			DurationSec: dur,
-			Loader:      platform.NvidiaStressLoaderNCCL,
-			GPUIndices:  t.params.GPUIndices,
-		}, j.append)
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -751,10 +996,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		sizeMB, passes := 256, 1
-		if t.params.StressMode {
-			sizeMB, passes = 1024, 3
-		}
+		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
+		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
 		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
 	case "storage":
 		if a == nil {
@@ -956,15 +1199,11 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
 		taskSerialEvent(t, "finished with status="+t.Status)
 		writeJSON(w, map[string]string{"status": "cancelled"})
 	case TaskRunning:
-		if t.job != nil {
-			t.job.abort()
+		if t.job == nil || !t.job.abort() {
+			writeError(w, http.StatusConflict, "task is not cancellable")
+			return
 		}
-		t.Status = TaskCancelled
-		now := time.Now()
-		t.DoneAt = &now
-		globalQueue.persistLocked()
-		taskSerialEvent(t, "finished with status="+t.Status)
-		writeJSON(w, map[string]string{"status": "cancelled"})
+		writeJSON(w, map[string]string{"status": "aborting"})
 	default:
 		writeError(w, http.StatusConflict, "task is not running or pending")
 	}
@@ -1010,9 +1249,6 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 			if t.job != nil {
 				t.job.abort()
 			}
-			t.Status = TaskCancelled
-			t.DoneAt = &now
-			taskSerialEvent(t, "finished with status="+t.Status)
 			n++
 		}
 	}
@@ -1037,6 +1273,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
@@ -1140,15 +1379,29 @@ func (q *taskQueue) loadLocked() {
 		}
 		q.assignTaskLogPathLocked(t)
 		if t.Status == TaskRunning {
-			// The task was interrupted by a bee-web restart. Child processes
-			// (e.g. bee-gpu-burn-worker) survive the restart in their own
-			// process groups and cannot be cancelled retroactively. Mark the
-			// task as failed so the user can decide whether to re-run it
-			// rather than blindly re-launching duplicate workers.
-			now := time.Now()
-			t.Status = TaskFailed
-			t.DoneAt = &now
-			t.ErrMsg = "interrupted by bee-web restart"
+			state, ok := readTaskRunnerState(t)
+			switch {
+			case ok && state.Status == TaskRunning && processAlive(state.PID):
+				t.runnerPID = state.PID
+				t.job = newTaskJobState(t.LogPath)
+			case ok && state.Status != TaskRunning:
+				t.runnerPID = state.PID
+				t.Status = state.Status
+				t.ErrMsg = state.Error
+				now := state.UpdatedAt
+				if now.IsZero() {
+					now = time.Now()
+				}
+				t.DoneAt = &now
+			default:
+				if taskMayLeaveOrphanWorkers(t.Target) {
+					_ = platform.KillTestWorkers()
+				}
+				now := time.Now()
+				t.Status = TaskFailed
+				t.DoneAt = &now
+				t.ErrMsg = "interrupted by bee-web restart"
+			}
 		} else if t.Status == TaskPending {
 			t.StartedAt = nil
 			t.DoneAt = nil
@@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
 	}
 }

+// TestRunTaskUsesQuickPresetForMemoryValidate checks that a "memory" task
+// with StressMode set and no burn profile invokes the memory acceptance pack
+// with the 512 MB x 1 pass preset.
+func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
+	var seenSizeMB, seenPasses int
+
+	// Swap in a stub that records its arguments; restore it afterwards.
+	saved := runMemoryAcceptancePackCtx
+	t.Cleanup(func() { runMemoryAcceptancePackCtx = saved })
+	runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
+		seenSizeMB, seenPasses = sizeMB, passes
+		return "/tmp/memory-validate.tar.gz", nil
+	}
+
+	queue := &taskQueue{
+		opts: &HandlerOptions{App: &app.App{}},
+	}
+	task := &Task{
+		ID:        "mem-validate-1",
+		Name:      "Memory SAT",
+		Target:    "memory",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{StressMode: true},
+	}
+
+	queue.runTask(task, &jobState{}, context.Background())
+
+	if !(seenSizeMB == 512 && seenPasses == 1) {
+		t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", seenSizeMB, seenPasses)
+	}
+}
+
 func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
 	dir := t.TempDir()
 	q := &taskQueue{
@@ -10,4 +10,4 @@ Generic engineering rules live in `bible/rules/patterns/`.
 | `architecture/system-overview.md` | What bee does, scope, tech stack |
 | `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
 | `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
-| `decisions/` | Architectural decision log |
+| `decisions/` | Architectural decision log, including read-only submodule policy |
@@ -58,6 +58,8 @@ Fills gaps where Redfish/logpile is blind:
 - `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
 - Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
 - Contract fields that have no honest local source on a generic Linux host may remain empty.
+- Embedded submodules such as `internal/chart/` and `bible/` are read-only for `bee` feature work.
+- If the UI needs extra information, `bee` must emit it through the standard audit JSON contract rather than patching `chart`.

 ## Tech stack

@@ -101,7 +103,7 @@ Fills gaps where Redfish/logpile is blind:
 | `iso/builder/` | ISO build scripts and `live-build` profile |
 | `iso/overlay/` | Source overlay copied into a staged build overlay |
 | `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
-| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
+| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web`; update by submodule pointer only, never by local `bee`-specific edits |
 | `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
 | `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
 | `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
@@ -0,0 +1,39 @@
+# Decision: Treat embedded submodules as read-only
+
+## Context
+
+`bee` embeds external git submodules such as:
+
+- `internal/chart/` — `reanimator/chart`, a generic read-only viewer for Reanimator JSON snapshots
+- `bible/` — shared engineering rules and contracts
+
+These repositories are reused by other projects. A local feature request in `bee`
+must not be solved by silently changing shared submodule behavior.
+
+The concrete failure mode here was attempting to add project-specific storage
+telemetry presentation by editing `internal/chart/`. That couples a shared viewer
+to one host application's needs and creates hidden cross-project regressions.
+
+## Decision
+
+Embedded submodules are read-only from the point of view of `bee`.
+
+- Do not implement `bee`-specific behavior by editing `internal/chart/`.
+- Do not implement `bee`-specific behavior by editing `bible/`.
+- If `bee` needs new data in the report, produce it in the standard audit JSON
+  emitted by `bee` itself.
+- `chart` must continue to consume the canonical snapshot as an external viewer,
+  without host-specific forks.
+- Updating a submodule pointer to an upstream commit is allowed.
+- Carrying local unmerged submodule commits as part of a `bee` feature is forbidden.
+
+## Consequences
+
+- Audit/report features must be expressed through the contract in
+  `bible-local/docs/hardware-ingest-contract.md`.
+- `bee` owns collection, normalization, and serialization of storage telemetry in
+  `hardware.storage[]`.
+- `chart` remains a pure visualization module that reads the snapshot it is given.
+- If a capability is genuinely missing in a shared submodule, it must be proposed
+  and landed upstream as a generic change first, then pulled into `bee` via a
+  normal submodule update.
@@ -6,3 +6,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
 |---|---|---|
 | 2026-03-05 | Use NVIDIA proprietary driver | active |
 | 2026-04-01 | Treat memtest as explicit ISO content | active |
+| 2026-04-29 | Treat embedded submodules as read-only | active |
@@ -110,8 +110,12 @@ nvidia-smi / lspci (audit collection)

 ---

-## What Needs Fixing
+## Fixed Issues

-1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
-2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
-3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
+All previously open items are resolved:
+
+1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
+2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
+3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
+4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
+5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
@@ -15,6 +15,41 @@ This applies to:
 - `iso/builder/config/package-lists/*.list.chroot`
 - Any package referenced in bootloader configs, hooks, or overlay scripts

+## Bootloader sync rule
+
+The ISO has two independent bootloader configs that must be kept in sync manually:
+
+| File | Used by |
+|------|---------|
+| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
+| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
+
+live-build does NOT derive one from the other. Any new boot entry, kernel parameter
+change, or new mode added to one file must be manually mirrored in the other.
+
+**Canonical entry list** (both files must have all of these):
+
+| Label | Key params |
+|-------|-----------|
+| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
+| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
+| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
+| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
+| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
+| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
+
+**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
+```
+net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
+numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+nowatchdog nosoftlockup
+```
+(fail-safe is the exception — it deliberately uses minimal params.)
+
+**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
+live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
+configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
+
 ## Memtest rule

 Do not assume live-build's built-in memtest integration is sufficient for `bee`.
@@ -0,0 +1,31 @@
+# Contract: ASCII-Safe Text in Scripts and Boot Configs
+
+Version: 1.0
+
+## Principle
+
+Shell scripts, bootloader configs, and any text rendered on serial/SOL consoles must use only printable ASCII characters. Non-ASCII Unicode — including typographic punctuation such as the em-dash (U+2014 `—`), en-dash (U+2013 `–`), curly quotes, and ellipsis (U+2026 `…`) — breaks rendering on serial terminals, GRUB text/serial mode, IPMI SOL, and tooling that assumes ASCII.
+
+## Rules
+
+- Never use em-dash (`—`) or en-dash (`–`) in any shell script, GRUB config, syslinux/isolinux config, or service unit file. Use ASCII double-hyphen `--` or single hyphen `-` instead.
+- Never use curly quotes (`“` `”` `‘` `’`) in shell scripts or configs. Use straight quotes `"` and `'`.
+- Never use the Unicode ellipsis (`…`). Use `...`.
+- GRUB `menuentry` and `submenu` titles must be ASCII-only — GRUB serial terminal output is ASCII; non-ASCII characters render as garbage or are dropped.
+- Comments in GRUB theme files (`.txt`) must also be ASCII-only, as GRUB may parse the entire file.
+
+## Why
+
+GRUB renders menus over both `gfxterm` (graphical, Unicode-capable) and `serial` (ASCII-only) simultaneously when `terminal_output gfxterm serial` is set. The serial output — used by IPMI SOL and BMC remote consoles — cannot display multi-byte UTF-8 sequences and shows raw bytes or drops characters. A menuentry title `"EASY-BEE — GSP=off"` appears as `"EASY-BEE â€" GSP=off"` or `"EASY-BEE  GSP=off"` on SOL, making the menu unreadable.
+
+## Anti-patterns
+
+- `menuentry "EASY-BEE — GSP=off"` — em-dash in GRUB title
+- `# bee logo — centered` — em-dash in GRUB theme comment
+- `echo "done — reboot"` in a shell script displayed over serial
+
+## Correct form
+
+- `menuentry "EASY-BEE -- GSP=off"`
+- `# bee logo - centered`
+- `echo "done - reboot"`
@@ -31,10 +31,10 @@ Build with explicit SSH keys baked into the ISO:
 sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
 ```

-Rebuild the builder image:
+Force a clean rebuild of the builder image and build caches:

 ```sh
-sh iso/builder/build-in-container.sh --rebuild-image
+sh iso/builder/build-in-container.sh --clean-build
 ```

 Use a custom cache directory:
@@ -1,6 +1,7 @@
 DEBIAN_VERSION=12
 DEBIAN_KERNEL_ABI=auto
 NVIDIA_DRIVER_VERSION=590.48.01
+NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
 NCCL_VERSION=2.28.9-1
 NCCL_CUDA_VERSION=13.0
 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
 COMGR_VERSION=2.8.0.60304-76~22.04
 GO_VERSION=1.24.0
 AUDIT_VERSION=1.0.0
+MEMTEST_VERSION=6.10-4
@@ -23,16 +23,17 @@ lb config noauto \
    --bootloaders "grub-efi,syslinux" \
    --debian-installer none \
    --archive-areas "main contrib non-free non-free-firmware" \
-    --mirror-bootstrap "https://deb.debian.org/debian" \
-    --mirror-chroot "https://deb.debian.org/debian" \
-    --mirror-binary "https://deb.debian.org/debian" \
+    --mirror-bootstrap "http://mirror.mephi.ru/debian/" \
+    --mirror-chroot "http://mirror.mephi.ru/debian/" \
+    --mirror-binary "http://mirror.mephi.ru/debian/" \
    --security true \
    --linux-flavours "amd64" \
    --linux-packages "${LB_LINUX_PACKAGES}" \
    --memtest memtest86+ \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
-    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --debootstrap-options "--include=ca-certificates" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
@@ -35,6 +35,8 @@ typedef void *CUstream;
 #define MAX_STRESS_STREAMS 16
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
+#define MAX_SINGLE_PRECISION_STREAMS 4
+#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)

 static const char *ptx_source =
    ".version 6.0\n"
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
    return stream_count;
 }

+/* Limit a per-profile byte budget to at most
+ * MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES; smaller values pass through
+ * unchanged. */
+static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
+    const size_t cap = MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
+
+    return profile_budget_bytes > cap ? cap : profile_budget_bytes;
+}
+
 static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
    if (!api->cuStreamDestroy) {
        return;
@@ -704,6 +713,19 @@ static const struct profile_desc k_profiles[] = {

 #define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))

+/* Decide whether a profile takes part in the current run.
+ *
+ * Returns 0 when the profile is disabled or needs a higher compute
+ * capability than cc. With a precision_filter, returns 1 only when the
+ * profile's block_label equals the filter. Without a filter, returns 1 for
+ * every label except "fp64" and "fp4". */
+static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
+    const char *label;
+
+    if (!desc->enabled || cc < desc->min_cc) {
+        return 0;
+    }
+
+    label = desc->block_label;
+    if (precision_filter == NULL) {
+        /* Mixed/all phases intentionally exclude fp64/fp4 for now: both
+         * paths are unstable on the current benchmark fleet and can abort
+         * the whole mixed pass after earlier phases already collected
+         * useful telemetry. */
+        if (strcmp(label, "fp64") == 0 || strcmp(label, "fp4") == 0) {
+            return 0;
+        }
+        return 1;
+    }
+    return strcmp(label, precision_filter) == 0;
+}
+
 static int load_cublaslt(struct cublaslt_api *api) {
    memset(api, 0, sizeof(*api));
    api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -908,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
                           CUstream stream,
                           size_t profile_budget_bytes,
                           struct prepared_profile *out) {
-    memset(out, 0, sizeof(*out));
-    out->desc = *desc;
-    out->stream = stream;
-
    size_t bytes_per_cell = 0;
+    size_t attempt_budget = profile_budget_bytes;
+
    bytes_per_cell += bytes_for_elements(desc->a_type, 1);
    bytes_per_cell += bytes_for_elements(desc->b_type, 1);
    bytes_per_cell += bytes_for_elements(desc->c_type, 1);
@@ -921,106 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
        return 0;
    }

-    uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
-    out->m = dim;
-    out->n = dim;
-    out->k = dim;
+    while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
+        memset(out, 0, sizeof(*out));
+        out->desc = *desc;
+        out->stream = stream;

-    size_t desired_workspace = profile_budget_bytes / 8u;
-    if (desired_workspace > 32u * 1024u * 1024u) {
-        desired_workspace = 32u * 1024u * 1024u;
-    }
-    desired_workspace = round_down_size(desired_workspace, 256u);
+        uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
+        out->m = dim;
+        out->n = dim;
+        out->k = dim;

-    size_t a_bytes = 0;
-    size_t b_bytes = 0;
-    size_t c_bytes = 0;
-    size_t d_bytes = 0;
-    size_t scale_bytes = 0;
-    while (1) {
-        a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
-        b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
-        c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
-        d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
-        scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+        size_t desired_workspace = attempt_budget / 8u;
+        if (desired_workspace > 32u * 1024u * 1024u) {
+            desired_workspace = 32u * 1024u * 1024u;
+        }
+        desired_workspace = round_down_size(desired_workspace, 256u);

-        size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
-        if (matrix_bytes <= profile_budget_bytes) {
-            size_t remaining = profile_budget_bytes - matrix_bytes;
-            out->workspace_size = desired_workspace;
-            if (out->workspace_size > remaining) {
-                out->workspace_size = round_down_size(remaining, 256u);
+        size_t a_bytes = 0;
+        size_t b_bytes = 0;
+        size_t c_bytes = 0;
+        size_t d_bytes = 0;
+        size_t scale_bytes = 0;
+        while (1) {
+            a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
+            b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
+            c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
+            d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
+            scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+
+            size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
+            if (matrix_bytes <= attempt_budget) {
+                size_t remaining = attempt_budget - matrix_bytes;
+                out->workspace_size = desired_workspace;
+                if (out->workspace_size > remaining) {
+                    out->workspace_size = round_down_size(remaining, 256u);
+                }
+                break;
            }
-            break;
+
+            if (out->m <= (uint64_t)desc->min_multiple) {
+                break;
+            }
+            out->m -= (uint64_t)desc->min_multiple;
+            out->n = out->m;
+            out->k = out->m;
+        }
+        if (out->m < (uint64_t)desc->min_multiple) {
+            attempt_budget /= 2u;
+            continue;
        }

-        if (out->m <= (uint64_t)desc->min_multiple) {
-            return 0;
-        }
-        out->m -= (uint64_t)desc->min_multiple;
-        out->n = out->m;
-        out->k = out->m;
-    }
-
-    if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
-        !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cudaDataType_t scale_type = matmul_scale_type(desc);
-    if (!check_cublas("cublasLtMatmulDescCreate",
-                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cublasOperation_t transa = CUBLAS_OP_T;
-    cublasOperation_t transb = CUBLAS_OP_N;
-    if (!check_cublas("set TRANSA",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSA,
-                                                             &transa,
-                                                             sizeof(transa))) ||
-        !check_cublas("set TRANSB",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSB,
-                                                             &transb,
-                                                             sizeof(transb)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (desc->needs_scalar_scale) {
-        float one = 1.0f;
-        if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
-            !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+        if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
+            !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
-            !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+
+        cudaDataType_t scale_type = matmul_scale_type(desc);
+        if (!check_cublas("cublasLtMatmulDescCreate",
+                          cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
-        void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
-        if (!check_cublas("set A scale ptr",
+
+        cublasOperation_t transa = CUBLAS_OP_T;
+        cublasOperation_t transb = CUBLAS_OP_N;
+        if (!check_cublas("set TRANSA",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
-                                                                 &a_scale_ptr,
-                                                                 sizeof(a_scale_ptr))) ||
-            !check_cublas("set B scale ptr",
+                                                                 CUBLASLT_MATMUL_DESC_TRANSA,
+                                                                 &transa,
+                                                                 sizeof(transa))) ||
+            !check_cublas("set TRANSB",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
-                                                                 &b_scale_ptr,
-                                                                 sizeof(b_scale_ptr)))) {
+                                                                 CUBLASLT_MATMUL_DESC_TRANSB,
+                                                                 &transb,
+                                                                 sizeof(transb)))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-    }
+
+        if (desc->needs_scalar_scale) {
+            float one = 1.0f;
+            if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
+                !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
+                !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
+            void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
+            if (!check_cublas("set A scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                                     &a_scale_ptr,
+                                                                     sizeof(a_scale_ptr))) ||
+                !check_cublas("set B scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                                     &b_scale_ptr,
+                                                                     sizeof(b_scale_ptr)))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }

 #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
    if (desc->needs_block_scale) {
@@ -1060,62 +1089,65 @@ static int prepare_profile(struct cublaslt_api *cublas,
    }
 #endif

-    if (!check_cublas("create A layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
-        !check_cublas("create B layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
-        !check_cublas("create C layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
-        !check_cublas("create D layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (out->workspace_size > 0) {
-        if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+        if (!check_cublas("create A layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
+            !check_cublas("create B layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
+            !check_cublas("create C layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
+            !check_cublas("create D layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
+
+        if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        if (out->workspace_size > 0) {
+            if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }
+
+        if (!check_cublas("set workspace",
+                          cublas->cublasLtMatmulPreferenceSetAttribute(
+                              out->preference,
+                              CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                              &out->workspace_size,
+                              sizeof(out->workspace_size)))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        int found = 0;
+        if (check_cublas("heuristic",
+                         cublas->cublasLtMatmulAlgoGetHeuristic(handle,
+                                                                out->op_desc,
+                                                                out->a_layout,
+                                                                out->b_layout,
+                                                                out->c_layout,
+                                                                out->d_layout,
+                                                                out->preference,
+                                                                1,
+                                                                &out->heuristic,
+                                                                &found)) &&
+            found > 0) {
+            out->ready = 1;
+            return 1;
+        }
+
+        destroy_profile(cublas, cuda, out);
+        attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
+        if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
+            break;
+        }
    }

-    if (!check_cublas("set workspace",
-                      cublas->cublasLtMatmulPreferenceSetAttribute(
-                          out->preference,
-                          CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-                          &out->workspace_size,
-                          sizeof(out->workspace_size)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    int found = 0;
-    if (!check_cublas("heuristic",
-                      cublas->cublasLtMatmulAlgoGetHeuristic(handle,
-                                                             out->op_desc,
-                                                             out->a_layout,
-                                                             out->b_layout,
-                                                             out->c_layout,
-                                                             out->d_layout,
-                                                             out->preference,
-                                                             1,
-                                                             &out->heuristic,
-                                                             &found))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-    if (found <= 0) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    out->ready = 1;
-    return 1;
+    return 0;
 }

 static int run_cublas_profile(cublasLtHandle_t handle,
@@ -1180,6 +1212,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
+    int budget_profiles = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1202,8 +1235,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,

    /* Count profiles matching the filter (for deciding what to run). */
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
-            (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned++;
        }
    }
@@ -1215,30 +1247,41 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    }

    /* Count all profiles active on this GPU regardless of filter.
-     * Used as the budget divisor so matrix sizes stay consistent whether
-     * running all precisions together or a single-precision phase. */
+     * Mixed phases still divide budget across the full precision set, while
+     * single-precision benchmark phases dedicate budget only to active
+     * profiles matching precision_filter. */
    int planned_total = 0;
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned_total++;
        }
    }
    if (planned_total < planned) {
        planned_total = planned;
    }
+    budget_profiles = planned_total;
+    if (precision_filter != NULL) {
+        budget_profiles = planned;
+    }
+    if (budget_profiles <= 0) {
+        budget_profiles = planned_total;
+    }

    requested_budget = (size_t)size_mb * 1024u * 1024u;
-    if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
-        requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+    if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
-    if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
-        total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+    if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    if (query_multiprocessor_count(cuda, dev, &mp_count) &&
        cuda->cuStreamCreate &&
        cuda->cuStreamDestroy) {
-        stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
+        stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
+    }
+    if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
+        stream_count = MAX_SINGLE_PRECISION_STREAMS;
    }
    if (stream_count > 1) {
        int created = 0;
@@ -1251,18 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        }
    }
    report->stream_count = stream_count;
-    per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
+    per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
    }
+    if (precision_filter != NULL) {
+        per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
+    }
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
                  mp_count,
+                  budget_profiles,
                  per_profile_budget / (1024u * 1024u));

    for (int i = 0; i < profile_count; i++) {
@@ -1275,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
-        if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
+        if (!profile_allowed_for_run(desc, cc, precision_filter)) {
            append_detail(report->details,
                          sizeof(report->details),
-                          "%s=SKIPPED precision_filter\n",
+                          "%s=SKIPPED benchmark_disabled\n",
                          desc->name);
            continue;
        }
@@ -10,7 +10,6 @@ IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
 BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
 CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
 AUTH_KEYS=""
-REBUILD_IMAGE=0
 CLEAN_CACHE=0
 VARIANT="all"

@@ -22,17 +21,12 @@ while [ $# -gt 0 ]; do
            CACHE_DIR="$2"
            shift 2
            ;;
-        --rebuild-image)
-            REBUILD_IMAGE=1
-            shift
-            ;;
        --authorized-keys)
            AUTH_KEYS="$2"
            shift 2
            ;;
        --clean-build)
            CLEAN_CACHE=1
-            REBUILD_IMAGE=1
            shift
            ;;
        --variant)
@@ -41,7 +35,7 @@ while [ $# -gt 0 ]; do
            ;;
        *)
            echo "unknown arg: $1" >&2
-            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
+            echo "usage: $0 [--cache-dir /path] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
            exit 1
            ;;
    esac
@@ -105,7 +99,7 @@ image_matches_platform() {
 }

 NEED_BUILD_IMAGE=0
-if [ "$REBUILD_IMAGE" = "1" ]; then
+if [ "$CLEAN_CACHE" = "1" ]; then
    NEED_BUILD_IMAGE=1
 elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
    NEED_BUILD_IMAGE=1
@@ -161,6 +155,7 @@ run_variant() {
            -e GOMODCACHE=/cache/go-mod \
            -e TMPDIR=/cache/tmp \
            -e BEE_CACHE_DIR=/cache/bee \
+            -e BEE_REQUIRE_MEMTEST=1 \
            -w /work \
            "${IMAGE_REF}" \
            sh /work/iso/builder/build.sh --variant "${_v}" \
@@ -175,6 +170,7 @@ run_variant() {
            -e GOMODCACHE=/cache/go-mod \
            -e TMPDIR=/cache/tmp \
            -e BEE_CACHE_DIR=/cache/bee \
+            -e BEE_REQUIRE_MEMTEST=1 \
            -w /work \
            "${IMAGE_REF}" \
            sh /work/iso/builder/build.sh --variant "${_v}"
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
 export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT

 . "${BUILDER_DIR}/VERSIONS"
+export MEMTEST_VERSION
 export PATH="$PATH:/usr/local/go/bin"
 : "${BEE_REQUIRE_MEMTEST:=0}"

@@ -125,6 +126,37 @@ resolve_iso_version() {
    resolve_audit_version
 }

+sync_builder_workdir() {
+    src_dir="$1"
+    dst_dir="$2"
+
+    mkdir -p "$dst_dir"
+
+    # Historical bug: old workdirs could keep config/bootloaders/grub-pc even
+    # after the source tree moved to grub-efi only. Remove bootloaders eagerly
+    # so reused workdirs cannot leak stale templates into a new ISO build.
+    rm -rf "$dst_dir/config/bootloaders"
+
+    rsync -a --delete \
+        --exclude='cache/' \
+        --exclude='chroot/' \
+        --exclude='.build/' \
+        --exclude='*.iso' \
+        --exclude='*.packages' \
+        --exclude='*.contents' \
+        --exclude='*.files' \
+        "$src_dir/" "$dst_dir/"
+
+    if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
+        echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
+        exit 1
+    fi
+    if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
+        echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
+        exit 1
+    fi
+}
+
 iso_list_files() {
    iso_path="$1"

@@ -202,7 +234,7 @@ dump_memtest_debug() {

        echo "-- source bootloader templates --"
        for cfg in \
-            "${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
+            "${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
            "${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
            if [ -f "$cfg" ]; then
                echo "  file: $cfg"
@@ -465,6 +497,75 @@ validate_iso_memtest() {
    echo "=== memtest validation OK ==="
 }

+# Check that the bootloader configs inside a built ISO contain the expected
+# live boot entries.
+#   $1  path to the ISO image
+# Reads boot/grub/grub.cfg and isolinux/live.cfg out of the ISO into temp
+# files, then requires: no leftover @APPEND_LIVE@/@KERNEL_LIVE@/@INITRD_LIVE@
+# placeholders, the two EASY-BEE GRUB menu entries, and boot=live / toram
+# kernel arguments in both configs. Any failed check prints an error to
+# stderr, removes the temp files and exits the script with status 1.
+validate_iso_live_boot_entries() {
+    iso_path="$1"
+    echo "=== validating live boot entries in ISO ==="
+
+    if [ ! -f "$iso_path" ]; then
+        echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
+        exit 1
+    fi
+    if ! require_iso_reader "$iso_path" >/dev/null 2>&1; then
+        echo "ERROR: ISO reader unavailable for live boot validation" >&2
+        exit 1
+    fi
+
+    grub_cfg="$(mktemp)"
+    isolinux_cfg="$(mktemp)"
+
+    if ! iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg"; then
+        echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+    if ! iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg"; then
+        echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+
+    # Template placeholders must have been substituted in both configs.
+    ! grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg" || {
+        echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+
+    # GRUB: menu titles first, then kernel command lines.
+    if ! grep -q 'menuentry "EASY-BEE"' "$grub_cfg"; then
+        echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+    if ! grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg"; then
+        echo "ERROR: GRUB toram entry is missing" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+    if ! grep -q 'linux .*boot=live ' "$grub_cfg"; then
+        echo "ERROR: GRUB live entry is missing boot=live" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+    if ! grep -q 'linux .*boot=live .*toram ' "$grub_cfg"; then
+        echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+
+    # isolinux: the same arguments are expected on append lines.
+    if ! grep -q 'append .*boot=live ' "$isolinux_cfg"; then
+        echo "ERROR: isolinux live entry is missing boot=live" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+    if ! grep -q 'append .*boot=live .*toram ' "$isolinux_cfg"; then
+        echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+
+    rm -f "$grub_cfg" "$isolinux_cfg"
+    echo "=== live boot validation OK ==="
+}
+
 validate_iso_nvidia_runtime() {
    iso_path="$1"
    [ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
@@ -541,6 +642,185 @@ label memtest
 EOF
 }

+# Parse the first live kernel/initrd pair out of a GRUB config.
+# $1: path to the grub.cfg to inspect.
+# On success (return 0) sets the globals grub_kernel, grub_append and
+# grub_initrd. Returns 1 when no "linux /live/..." or "initrd /live/..." line
+# is found, or when any extracted value is empty.
+extract_live_grub_entry() {
+    cfg="$1"
+    live_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
+    live_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
+    [ -n "$live_linux" ] || return 1
+    [ -n "$live_initrd" ] || return 1
+
+    grub_kernel="$(printf '%s\n' "$live_linux" | awk '{print $2}')"
+    # Strip the leading "linux <kernel>" with a whitespace-tolerant pattern.
+    # Splitting on single spaces (cut -d' ') miscounts fields as soon as the
+    # line is indented or padded with several spaces, which leaks
+    # "linux <kernel>" into the append string.
+    grub_append="$(printf '%s\n' "$live_linux" | awk '{ sub(/^[[:space:]]*linux[[:space:]]+[^[:space:]]+[[:space:]]*/, ""); print }')"
+    grub_initrd="$(printf '%s\n' "$live_initrd" | awk '{print $2}')"
+    [ -n "$grub_kernel" ] || return 1
+    [ -n "$grub_append" ] || return 1
+    [ -n "$grub_initrd" ] || return 1
+    return 0
+}
+
+# Read LB_BOOTAPPEND_LIVE from a live-build tree.
+# $1: live-build working directory (must contain config/binary).
+# On success (return 0) sets the globals live_build_append and
+# LB_BOOTAPPEND_LIVE. Returns 1 when config/binary is missing, cannot be
+# sourced, or leaves LB_BOOTAPPEND_LIVE empty; live_build_append is left
+# untouched in that case.
+load_live_build_append() {
+    lb_dir="$1"
+    binary_cfg="$lb_dir/config/binary"
+    [ -f "$binary_cfg" ] || return 1
+
+    # config/binary is generated by live-build and contains shell variable
+    # assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
+    # Source it in a subshell: only the one value we need crosses back, the
+    # other assignments cannot clobber build-script variables, and a broken
+    # file makes this function return 1 instead of aborting the whole shell.
+    # shellcheck disable=SC1090
+    _lb_append="$(
+        . "$binary_cfg" >/dev/null || exit 1
+        printf '%s' "${LB_BOOTAPPEND_LIVE:-}"
+    )" || return 1
+
+    [ -n "$_lb_append" ] || return 1
+    # Keep LB_BOOTAPPEND_LIVE visible to callers, as the in-shell source did.
+    LB_BOOTAPPEND_LIVE="$_lb_append"
+    live_build_append="$_lb_append"
+    return 0
+}
+
+# Pull the first live boot entry out of an isolinux config.
+# $1: path to the isolinux config to inspect.
+# On success (return 0) sets the globals isolinux_kernel,
+# isolinux_initrd_path and isolinux_append (the first "append" line with the
+# keyword removed). Returns 1 if any of the three lines is missing or a path
+# cannot be extracted.
+extract_live_isolinux_entry() {
+    cfg="$1"
+
+    isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
+    isolinux_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
+    isolinux_append="$(awk '/^[[:space:]]*append[[:space:]]+/ { sub(/^[[:space:]]*append[[:space:]]+/, ""); print; exit }' "$cfg")"
+    if [ -z "$isolinux_linux" ] || [ -z "$isolinux_initrd" ] || [ -z "$isolinux_append" ]; then
+        return 1
+    fi
+
+    # The path is the second whitespace-separated word of each matched line.
+    isolinux_kernel="$(printf '%s\n' "$isolinux_linux" | awk '{print $2}')"
+    isolinux_initrd_path="$(printf '%s\n' "$isolinux_initrd" | awk '{print $2}')"
+    if [ -z "$isolinux_kernel" ] || [ -z "$isolinux_initrd_path" ]; then
+        return 1
+    fi
+
+    return 0
+}
+
+# Overwrite a GRUB config with the canonical EASY-BEE menu.
+# $1: destination grub.cfg (truncated and rewritten)
+# $2: kernel path placed after "linux"
+# $3: live boot parameters inserted right after the kernel path
+# $4: initrd path placed after "initrd"
+# The menu has a normal entry, a toram entry, a memtest entry (EFI or BIOS
+# variant chosen by grub_platform at boot) and an EFI-only firmware entry.
+write_canonical_grub_cfg() {
+    cfg="$1"
+    kernel="$2"
+    append_live="$3"
+    initrd="$4"
+
+    # Unquoted heredoc: ${kernel}, ${append_live} and ${initrd} are expanded
+    # now, while \${grub_platform} is written literally for GRUB to evaluate.
+    cat > "$cfg" <<EOF
+source /boot/grub/config.cfg
+
+echo ""
+echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
+echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
+echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
+echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
+echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
+echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
+echo "  Hardware Audit LiveCD"
+echo ""
+
+menuentry "EASY-BEE" {
+    linux   ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+    initrd  ${initrd}
+}
+
+menuentry "EASY-BEE -- load to RAM (toram)" {
+    linux   ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+    initrd  ${initrd}
+}
+
+
+if [ "\${grub_platform}" = "efi" ]; then
+    menuentry "Memory Test (memtest86+)" {
+        chainloader /boot/memtest86+x64.efi
+    }
+else
+    menuentry "Memory Test (memtest86+)" {
+        linux16 /boot/memtest86+x64.bin
+    }
+fi
+
+if [ "\${grub_platform}" = "efi" ]; then
+    menuentry "UEFI Firmware Settings" {
+        fwsetup
+    }
+fi
+EOF
+}
+
+# Overwrite an isolinux live.cfg with the canonical EASY-BEE menu.
+# $1: destination config file (truncated and rewritten)
+# $2: kernel path used on every "linux" line
+# $3: initrd path used on every "initrd" line
+# $4: live boot parameters that start every "append" line
+# Argument order differs from write_canonical_grub_cfg: here the initrd comes
+# before the append string.
+# NOTE(review): the labels are written with the literal token @FLAVOUR@ and
+# nothing in this function substitutes it — confirm that is intended for the
+# final binary/isolinux/live.cfg.
+write_canonical_isolinux_cfg() {
+    cfg="$1"
+    kernel="$2"
+    initrd="$3"
+    append_live="$4"
+
+    # Unquoted heredoc: ${kernel}, ${initrd} and ${append_live} expand here.
+    cat > "$cfg" <<EOF
+label live-@FLAVOUR@-normal
+    menu label ^EASY-BEE
+    menu default
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-toram
+    menu label EASY-BEE (^load to RAM)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-gsp-off
+    menu label EASY-BEE (^NVIDIA GSP=off)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-kms
+    menu label EASY-BEE (^KMS, no nomodeset)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-kms-gsp-off
+    menu label EASY-BEE (KMS, ^GSP=off)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-failsafe
+    menu label EASY-BEE (^fail-safe)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
+
+label memtest
+    menu label ^Memory Test (memtest86+)
+    linux /boot/memtest86+x64.bin
+EOF
+}
+
+# Replace the bootloader configs that live-build produced with the canonical
+# EASY-BEE menus and refresh the GRUB theme assets next to them.
+# $1: live-build working directory.
+# Kernel and initrd paths are taken from the existing configs; the live boot
+# parameters come from LB_BOOTAPPEND_LIVE when it can be loaded, otherwise
+# from the existing config. A config that is absent is skipped silently; one
+# that cannot be parsed is left untouched and a warning goes to stderr.
+enforce_live_build_bootloader_assets() {
+    lb_dir="$1"
+    grub_dir="$lb_dir/binary/boot/grub"
+    grub_cfg="$grub_dir/grub.cfg"
+    isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
+
+    load_live_build_append "$lb_dir" || {
+        echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
+        live_build_append=""
+    }
+
+    # --- GRUB ---
+    if [ -f "$grub_cfg" ]; then
+        if ! extract_live_grub_entry "$grub_cfg"; then
+            echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
+        else
+            mkdir -p "$grub_dir/live-theme"
+            cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
+            cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
+            cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
+            write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
+            echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
+        fi
+    fi
+
+    # --- isolinux ---
+    if [ -f "$isolinux_cfg" ]; then
+        if ! extract_live_isolinux_entry "$isolinux_cfg"; then
+            echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
+        else
+            write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
+            echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
+        fi
+    fi
+}
+
 copy_memtest_from_deb() {
    deb="$1"
    dst_boot="$2"
@@ -568,6 +848,73 @@ reset_live_build_stage() {
    done
 }

+# Marker file touched at the end of each successful full lb build of this variant.
+FULL_BUILD_MARKER="${BUILD_WORK_DIR}/.bee-full-build-marker"
+
+# Decide between a full live-build run and the squashfs fast-path.
+# Exit status 0 = full lb build required, 1 = fast-path is safe.
+# The fast-path is only safe when artifacts of a previous full build exist and
+# nothing "heavy" is newer than the marker: VERSIONS, auto/config, Dockerfile,
+# package lists, hooks, archives or bootloader config. Light changes
+# (Go source, overlay scripts/configs) do not force a full build.
+needs_full_build() {
+    # Without the marker, squashfs and ISO of a prior build there is nothing to patch.
+    for _artifact in \
+        "${FULL_BUILD_MARKER}" \
+        "${BUILD_WORK_DIR}/binary/live/filesystem.squashfs" \
+        "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
+    do
+        [ -f "$_artifact" ] || return 0
+    done
+
+    # First heavy path modified after the marker, if any.
+    _heavy="$(find \
+        "${BUILDER_DIR}/VERSIONS" \
+        "${BUILDER_DIR}/auto/config" \
+        "${BUILDER_DIR}/Dockerfile" \
+        "${BUILDER_DIR}/config/package-lists" \
+        "${BUILDER_DIR}/config/hooks" \
+        "${BUILDER_DIR}/config/archives" \
+        "${BUILDER_DIR}/config/bootloaders" \
+        -newer "${FULL_BUILD_MARKER}" 2>/dev/null | head -1)"
+
+    [ -n "$_heavy" ] || return 1
+
+    echo "=== full build required: heavy config changed: $(basename "$_heavy") ==="
+    return 0
+}
+
+# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
+# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
+# Reads BUILD_WORK_DIR, BEE_CACHE_DIR, BUILD_VARIANT and OVERLAY_STAGE_DIR.
+# NOTE(review): no step checks its exit status here; this presumably relies on
+# the script running under `set -e` — confirm.
+fast_path_repack_squashfs() {
+    _sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
+    _tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
+    echo "=== fast-path: unsquash ($(du -sh "$_sq" | cut -f1) compressed) ==="
+    # Start from a clean directory in case an earlier run left one behind.
+    rm -rf "$_tmp"
+    unsquashfs -d "$_tmp" "$_sq"
+    echo "=== fast-path: syncing overlay stage ==="
+    rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
+    echo "=== fast-path: repacking squashfs ==="
+    # Pack into a sibling ".new" file and move it over the original afterwards,
+    # so the existing squashfs is only replaced once mksquashfs has finished.
+    _sq_new="${_sq}.new"
+    rm -f "$_sq_new"
+    mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress
+    mv "$_sq_new" "$_sq"
+    rm -rf "$_tmp"
+    echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
+}
+
+# Fast-path: rebuild ISO by replacing only live/filesystem.squashfs via xorriso.
+# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
+# Reads BUILD_WORK_DIR; the prior ISO in that directory is replaced in place.
+fast_path_rebuild_iso() {
+    _sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
+    _prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
+    _new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
+    echo "=== fast-path: rebuilding ISO with xorriso ==="
+    rm -f "$_new"
+    # Load the prior ISO, map the repacked squashfs over /live/filesystem.squashfs
+    # and write the result to a separate ".new" image.
+    xorriso \
+        -indev  "$_prior" \
+        -outdev "$_new" \
+        -map    "$_sq" /live/filesystem.squashfs \
+        -boot_image any replay \
+        -commit
+    # Replace the prior ISO only after xorriso has written the new one.
+    mv "$_new" "$_prior"
+    echo "=== fast-path: ISO rebuilt ==="
+}
+
 recover_iso_memtest() {
    lb_dir="$1"
    iso_path="$2"
@@ -775,6 +1122,7 @@ run_optional_step_sh() {
        return 0
    fi

+    mkdir -p "${LOG_DIR}" 2>/dev/null || true
    step_log="${LOG_DIR}/${step_slug}.log"
    echo ""
    echo "=== optional step: ${step_name} ==="
@@ -798,13 +1146,14 @@ start_build_log
 # install them on the fly so NVIDIA modules and ISO kernel always match.
 if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
    echo "=== refreshing apt index to detect current kernel ABI ==="
-    apt-get update -qq
+    apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
    DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
        | awk '/Depends:.*linux-image-[0-9]/{print $2}' \
        | grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
        | head -1)
    if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
        echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
+        echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
        exit 1
    fi
    echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
@@ -929,15 +1278,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
 mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"

 # Sync builder config into variant work dir, preserving lb cache.
-rsync -a --delete \
-    --exclude='cache/' \
-    --exclude='chroot/' \
-    --exclude='.build/' \
-    --exclude='*.iso' \
-    --exclude='*.packages' \
-    --exclude='*.contents' \
-    --exclude='*.files' \
-    "${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
+sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"

 # Share deb package cache across variants.
 # Restore: populate work dir cache from shared cache before build.
@@ -951,86 +1292,6 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
    rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
 fi

-if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
-    cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
-source /boot/grub/config.cfg
-
-echo ""
-echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
-echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
-echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
-echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
-echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
-echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
-echo "  Hardware Audit LiveCD"
-echo ""
-
-menuentry "EASY-BEE" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
-
-submenu "EASY-BEE (advanced options) -->" {
-    menuentry "EASY-BEE — KMS (no nomodeset)" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-        initrd  @INITRD_LIVE@
-    }
-
-    menuentry "EASY-BEE — fail-safe" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-        initrd  @INITRD_LIVE@
-    }
-}
-
-if [ "${grub_platform}" = "efi" ]; then
-    menuentry "Memory Test (memtest86+)" {
-        chainloader /boot/memtest86+x64.efi
-    }
-else
-    menuentry "Memory Test (memtest86+)" {
-        linux16 /boot/memtest86+x64.bin
-    }
-fi
-
-if [ "${grub_platform}" = "efi" ]; then
-    menuentry "UEFI Firmware Settings" {
-        fwsetup
-    }
-fi
-EOF
-
-    cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
-label live-@FLAVOUR@-normal
-    menu label ^EASY-BEE
-    menu default
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@
-
-label live-@FLAVOUR@-kms
-    menu label EASY-BEE (^graphics/KMS)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms
-
-label live-@FLAVOUR@-toram
-    menu label EASY-BEE (^load to RAM)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ toram
-
-label live-@FLAVOUR@-failsafe
-    menu label EASY-BEE (^fail-safe)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
-
-label memtest
-    menu label ^Memory Test (memtest86+)
-    linux /boot/memtest86+x64.bin
-EOF
-fi
-
 rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
 rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -1259,6 +1520,7 @@ fi
 # --- substitute version placeholders in package list and archive ---
 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    sed -i \
+        -e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
        -e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
        "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
 elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
@@ -1292,6 +1554,21 @@ if [ -f "${LB_INCLUDES}/root/.ssh/authorized_keys" ]; then
    chmod 600 "${LB_INCLUDES}/root/.ssh/authorized_keys"
 fi

+# --- auto fast-path: squashfs surgery if only light files changed ---
+# When needs_full_build reports that the fast-path is safe, patch the existing
+# squashfs and ISO, validate the result and exit here, so none of the
+# live-build steps below run.
+if ! needs_full_build; then
+    echo "=== fast-path build (no heavy config changes since last full build) ==="
+    fast_path_repack_squashfs
+    fast_path_rebuild_iso
+    # NOTE(review): the fast-path helpers address the ISO via BUILD_WORK_DIR
+    # while this uses LB_DIR — presumably both name the same directory; confirm.
+    ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
+    validate_iso_live_boot_entries "$ISO_RAW"
+    validate_iso_nvidia_runtime "$ISO_RAW"
+    cp "$ISO_RAW" "$ISO_OUT"
+    echo ""
+    echo "=== done (${BUILD_VARIANT}, fast-path) ==="
+    echo "ISO: $ISO_OUT"
+    exit 0
+fi
+
 # --- build ISO using live-build ---
 echo ""
 echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
@@ -1301,10 +1578,18 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
 export BEE_GPU_VENDOR_UPPER

 cd "${LB_DIR}"
-run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
+run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
 run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
 dump_memtest_debug "pre-build" "${LB_DIR}"
 run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
+echo "=== enforcing canonical bootloader assets ==="
+enforce_live_build_bootloader_assets "${LB_DIR}"
+reset_live_build_stage "${LB_DIR}" "binary_checksums"
+reset_live_build_stage "${LB_DIR}" "binary_iso"
+reset_live_build_stage "${LB_DIR}" "binary_zsync"
+run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
+run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
+run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"

 # --- persist deb package cache back to shared location ---
 # This allows the second variant to reuse all downloaded packages.
@@ -1329,8 +1614,10 @@ if [ -f "$ISO_RAW" ]; then
        fi
    fi
    validate_iso_memtest "$ISO_RAW"
+    validate_iso_live_boot_entries "$ISO_RAW"
    validate_iso_nvidia_runtime "$ISO_RAW"
    cp "$ISO_RAW" "$ISO_OUT"
+    touch "${FULL_BUILD_MARKER}"
    echo ""
    echo "=== done (${BUILD_VARIANT}) ==="
    echo "ISO: $ISO_OUT"
@@ -23,9 +23,9 @@ insmod serial
 serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1

 insmod gfxterm
-insmod png
-
-source /boot/grub/theme.cfg

 terminal_input console serial
 terminal_output gfxterm serial
+
+insmod png
+source /boot/grub/theme.cfg
@@ -0,0 +1,28 @@
+source /boot/grub/config.cfg
+
+menuentry "EASY-BEE" {
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+    initrd  @INITRD_LIVE@
+}
+
+menuentry "EASY-BEE -- load to RAM (toram)" {
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+    initrd  @INITRD_LIVE@
+}
+
+
+if [ "${grub_platform}" = "efi" ]; then
+    menuentry "Memory Test (memtest86+)" {
+        chainloader /boot/memtest86+x64.efi
+    }
+else
+    menuentry "Memory Test (memtest86+)" {
+        linux16 /boot/memtest86+x64.bin
+    }
+fi
+
+if [ "${grub_platform}" = "efi" ]; then
+    menuentry "UEFI Firmware Settings" {
+        fwsetup
+    }
+fi
@@ -5,6 +5,13 @@ title-text: ""
 message-font: "Unifont Regular 16"
 terminal-font: "Unifont Regular 16"

+#bee logo - centered, upper third of screen
+ image {
+        top = 4%
+        left = 50%-200
+        file = "bee-logo.png"
+}
+
 #help bar at the bottom
 + label {
        top = 100%-50
@@ -21,17 +28,17 @@ terminal-font: "Unifont Regular 16"
 + boot_menu {
        left = 20%
        width = 60%
-        top = 62%
-        height = 38%-80
+        top = 65%
+        height = 35%-80
        item_color = "#c88000"
        item_font = "Unifont Regular 16"
        selected_item_color= "#f5a800"
        selected_item_font = "Unifont Regular 16"
-        item_height = 16
-        item_padding = 0
+        item_height = 20
+        item_padding = 2
        item_spacing = 4
        icon_width = 0
-        icon_heigh = 0
+        icon_height = 0
        item_icon_space = 0
 }

@@ -1,7 +1,7 @@
 set color_normal=light-gray/black
 set color_highlight=yellow/black

-if [ -e /boot/grub/splash.png ]; then
+if [ -e /boot/grub/live-theme/theme.txt ]; then
    set theme=/boot/grub/live-theme/theme.txt
 else
    set menu_color_normal=yellow/black
@@ -1,49 +0,0 @@
-source /boot/grub/config.cfg
-
-echo ""
-echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
-echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
-echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
-echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
-echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
-echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
-echo "  Hardware Audit LiveCD"
-echo ""
-
-menuentry "EASY-BEE" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
-
-submenu "EASY-BEE (advanced options) -->" {
-    menuentry "EASY-BEE — GSP=off" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-        initrd  @INITRD_LIVE@
-    }
-
-    menuentry "EASY-BEE — KMS (no nomodeset)" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-        initrd  @INITRD_LIVE@
-    }
-
-    menuentry "EASY-BEE — fail-safe" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-        initrd  @INITRD_LIVE@
-    }
-}
-
-if [ "${grub_platform}" = "efi" ]; then
-    menuentry "Memory Test (memtest86+)" {
-        chainloader /boot/memtest86+x64.efi
-    }
-else
-    menuentry "Memory Test (memtest86+)" {
-        linux16 /boot/memtest86+x64.bin
-    }
-fi
-
-if [ "${grub_platform}" = "efi" ]; then
-    menuentry "UEFI Firmware Settings" {
-        fwsetup
-    }
-fi
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
    menu default
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
-
-label live-@FLAVOUR@-kms
-    menu label EASY-BEE (^graphics/KMS)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

 label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

 label live-@FLAVOUR@-gsp-off
    menu label EASY-BEE (^NVIDIA GSP=off)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

-label live-@FLAVOUR@-kms-gsp-off
-    menu label EASY-BEE (g^raphics/KMS, GSP=off)
+label live-@FLAVOUR@-kms
+    menu label EASY-BEE (^KMS, no nomodeset)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-kms-gsp-off
+    menu label EASY-BEE (KMS, ^GSP=off)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

 label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0

 label memtest
    menu label ^Memory Test (memtest86+)
@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
 # Enable GPU-vendor specific services
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    systemctl enable nvidia-dcgm.service 2>/dev/null || true
+    systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
    systemctl enable bee-nvidia.service
 elif [ "$GPU_VENDOR" = "amd" ]; then
    # ROCm symlinks (packages install to /opt/rocm-*/bin/)
@@ -62,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
-chmod +x /usr/local/bin/bee-selfheal      2>/dev/null || true
-chmod +x /usr/local/bin/bee-boot-status  2>/dev/null || true
+chmod +x /usr/local/bin/bee-selfheal        2>/dev/null || true
+chmod +x /usr/local/bin/bee-boot-status    2>/dev/null || true
+chmod +x /usr/local/bin/bee-install        2>/dev/null || true
+chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
@@ -1,117 +0,0 @@
-#!/bin/sh
-# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
-set -e
-echo "=== generating bee wallpaper ==="
-mkdir -p /usr/share/bee
-
-python3 - <<'PYEOF'
-from PIL import Image, ImageDraw, ImageFont, ImageFilter
-import os
-
-W, H = 1920, 1080
-
-ASCII_ART = [
-    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
-    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
-    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
-    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
-    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
-    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
-]
-SUBTITLE = "  Hardware Audit LiveCD"
-
-FG = (0xF6, 0xD0, 0x47)
-FG_DIM = (0xD4, 0xA9, 0x1C)
-SHADOW = (0x5E, 0x47, 0x05)
-SUB = (0x96, 0x7A, 0x17)
-BG = (0x05, 0x05, 0x05)
-
-MONO_FONT_CANDIDATES = [
-    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
-    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
-]
-SUB_FONT_CANDIDATES = [
-    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
-    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
-]
-
-
-def load_font(candidates, size):
-    for path in candidates:
-        if os.path.exists(path):
-            return ImageFont.truetype(path, size)
-    return ImageFont.load_default()
-
-
-def mono_metrics(font):
-    probe = Image.new('L', (W, H), 0)
-    draw = ImageDraw.Draw(probe)
-    char_w = int(round(draw.textlength("M", font=font)))
-    bb = draw.textbbox((0, 0), "Mg", font=font)
-    char_h = bb[3] - bb[1]
-    return char_w, char_h
-
-
-def render_ascii_mask(font, lines, char_w, char_h, line_gap):
-    width = max(len(line) for line in lines) * char_w
-    height = len(lines) * char_h + line_gap * (len(lines) - 1)
-    mask = Image.new('L', (width, height), 0)
-    draw = ImageDraw.Draw(mask)
-    for row, line in enumerate(lines):
-        y = row * (char_h + line_gap)
-        for col, ch in enumerate(line):
-            if ch == ' ':
-                continue
-            x = col * char_w
-            draw.text((x, y), ch, font=font, fill=255)
-    return mask
-
-
-img = Image.new('RGB', (W, H), BG)
-draw = ImageDraw.Draw(img)
-
-# Soft amber glow under the logo without depending on font rendering.
-glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
-glow_draw = ImageDraw.Draw(glow)
-glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
-glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
-glow = glow.filter(ImageFilter.GaussianBlur(60))
-img = Image.alpha_composite(img.convert('RGBA'), glow)
-
-TARGET_LOGO_W = 400
-max_chars = max(len(line) for line in ASCII_ART)
-_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
-_probe_cw, _ = mono_metrics(_probe_font)
-font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
-font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
-char_w, char_h = mono_metrics(font_logo)
-logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
-logo_w, logo_h = logo_mask.size
-logo_x = (W - logo_w) // 2
-logo_y = 380
-
-sh_off = max(1, font_size_logo // 6)
-shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
-img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
-img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
-img.paste(FG, (logo_x, logo_y), logo_mask)
-
-font_sub = load_font(SUB_FONT_CANDIDATES, 30)
-sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
-sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
-sub_y = logo_y + logo_h + 48
-draw = ImageDraw.Draw(img)
-draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
-draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
-
-img = img.convert('RGB')
-
-img.save('/usr/share/bee/wallpaper.png', optimize=True)
-print('wallpaper written: /usr/share/bee/wallpaper.png')
-PYEOF
-
-echo "=== wallpaper done ==="
@@ -0,0 +1,46 @@
+#!/bin/sh
+# 9011-toram-rsync.hook.chroot
+#
+# Adds rsync to the initramfs so that live-boot's toram code takes the
+# rsync --progress path instead of the silent "cp -a" fallback.
+#
+# live-boot's 9990-toram-todisk.sh already contains:
+#   if [ -x /bin/rsync ]; then
+#       rsync -a --progress ... 1>/dev/console
+#   else
+#       cp -a ...   # no output
+#   fi
+#
+# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
+# which copies the binary + all shared-library dependencies into the initrd.
+
+set -e
+
+HOOK_DIR="/etc/initramfs-tools/hooks"
+HOOK="${HOOK_DIR}/bee-rsync"
+
+mkdir -p "${HOOK_DIR}"
+
+# Quoted heredoc delimiter: the hook body below is written verbatim, nothing
+# in it is expanded by this script.
+cat > "${HOOK}" << 'EOF'
+#!/bin/sh
+# initramfs hook: include rsync for live-boot toram progress output
+PREREQ=""
+prereqs() { echo "$PREREQ"; }
+case "$1" in prereqs) prereqs; exit 0 ;; esac
+
+. /usr/share/initramfs-tools/hook-functions
+
+if [ -x /usr/bin/rsync ]; then
+    copy_exec /usr/bin/rsync /bin
+fi
+EOF
+
+chmod +x "${HOOK}"
+
+echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
+
+# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
+# KVER is the last entry of /lib/modules in version-sort order.
+KVER=$(ls /lib/modules | sort -V | tail -1)
+echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
+update-initramfs -u -k "${KVER}"
+echo "9011-toram-rsync: done"
@@ -5,6 +5,8 @@ set -e

 : "${BEE_REQUIRE_MEMTEST:=0}"

+# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
+# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
 MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
 BINARY_BOOT_DIR="binary/boot"
 GRUB_CFG="binary/boot/grub/grub.cfg"
@@ -24,15 +26,23 @@ fail_or_warn() {
    return 0
 }

+# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
+# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
+# The template already has memtest entries hardcoded, so a missing config file
+# here is not an error; validate_iso_memtest() checks the final ISO instead.
+# Emit $1 through log with a "WARNING: " prefix.
+warn_only() { log "WARNING: $1"; }
+
 copy_memtest_file() {
    src="$1"
-    base="$(basename "$src")"
-    dst="${BINARY_BOOT_DIR}/${base}"
+    dst_name="${2:-$(basename "$src")}"
+    dst="${BINARY_BOOT_DIR}/${dst_name}"

    [ -f "$src" ] || return 1
    mkdir -p "${BINARY_BOOT_DIR}"
    cp "$src" "$dst"
-    log "copied ${base} from ${src}"
+    log "copied ${dst_name} from ${src}"
 }

 extract_memtest_from_deb() {
@@ -41,14 +51,44 @@ extract_memtest_from_deb() {

    log "extracting memtest payload from ${deb}"
    dpkg-deb -x "$deb" "$tmpdir"
-    for f in ${MEMTEST_FILES}; do
-        if [ -f "${tmpdir}/boot/${f}" ]; then
-            copy_memtest_file "${tmpdir}/boot/${f}"
-        fi
-    done
+
+    # EFI binary: both 5.x and 6.x use memtest86+x64.efi
+    if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
+        copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
+    fi
+
+    # BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
+    if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
+        copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
+    elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
+        copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
+    fi
+
    rm -rf "$tmpdir"
 }

+download_and_extract_memtest() {
+    tmpdl="$(mktemp -d)"
+    if [ -n "${MEMTEST_VERSION:-}" ]; then
+        pkg_spec="memtest86+=${MEMTEST_VERSION}"
+    else
+        pkg_spec="memtest86+"
+    fi
+    log "downloading ${pkg_spec} from apt"
+    if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
+        log "apt download failed, retrying after apt-get update"
+        apt-get update -qq >/dev/null 2>&1 || true
+        ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
+    fi
+    deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
+    if [ -n "$deb" ]; then
+        extract_memtest_from_deb "$deb"
+    else
+        log "apt download of memtest86+ failed"
+    fi
+    rm -rf "$tmpdl"
+}
+
 ensure_memtest_binaries() {
    missing=0
    for f in ${MEMTEST_FILES}; do
@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
    done
    [ "$missing" -eq 1 ] || return 0

+    # 1. Try files already placed by lb binary_memtest or chroot
    for root in chroot/boot /boot; do
        for f in ${MEMTEST_FILES}; do
            [ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
        done
+        # 6.x BIOS binary may lack x64 in name — copy with normalised name
+        if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
+            copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
+        fi
    done

    missing=0
@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
    done
    [ "$missing" -eq 1 ] || return 0

+    # 2. Try apt package cache (may be empty if lb binary_memtest already purged)
    for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
        [ -d "$root" ] || continue
        deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
        break
    done

+    missing=0
+    for f in ${MEMTEST_FILES}; do
+        [ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
+    done
+    [ "$missing" -eq 1 ] || return 0
+
+    # 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
+    download_and_extract_memtest
+
    missing=0
    for f in ${MEMTEST_FILES}; do
        if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
@@ -88,7 +143,7 @@ ensure_memtest_binaries() {

 ensure_grub_entry() {
    [ -f "$GRUB_CFG" ] || {
-        fail_or_warn "missing ${GRUB_CFG}"
+        warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
        return 0
    }

@@ -114,7 +169,7 @@ EOF

 ensure_isolinux_entry() {
    [ -f "$ISOLINUX_CFG" ] || {
-        fail_or_warn "missing ${ISOLINUX_CFG}"
+        warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
        return 0
    }

@@ -5,6 +5,7 @@
 # DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
 # CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
 # explicitly.
+nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
 datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
@@ -3,6 +3,7 @@ dmidecode
 smartmontools
 nvme-cli
 pciutils
+rsync
 ipmitool
 util-linux
 e2fsprogs
@@ -46,18 +47,30 @@ vim-tiny
 mc
 htop
 nvtop
-btop
 sudo
 zstd
 mstflint
 memtester
 stress-ng
 stressapptest
+fio
+iperf3
+iotop
+nload
+tcpdump
+hdparm
+sysstat
+lsscsi
+sg3-utils
+jq
+curl
+net-tools

 # QR codes (for displaying audit results)
 qrencode

 # Local desktop (openbox + chromium kiosk)
+gparted
 openbox
 tint2
 feh
@@ -1,6 +1,6 @@
 [Unit]
 Description=Bee: hardware audit
-After=bee-preflight.service bee-network.service bee-nvidia.service
+After=bee-preflight.service bee-network.service bee-nvidia.service bee-blackbox.service

 [Service]
 Type=oneshot
@@ -0,0 +1,18 @@
+# Runs "bee blackbox" under the bee-log-run wrapper for the lifetime of the
+# system. NOTE(review): bee-log-run presumably tees output into the log file
+# given as its first argument — confirm against the wrapper.
+[Unit]
+Description=Bee: USB black-box log mirror
+After=local-fs.target
+# Ordered ahead of the other bee units. With Type=simple this only guarantees
+# the process has been spawned before they start, not that it is ready.
+Before=bee-network.service bee-nvidia.service bee-preflight.service bee-audit.service bee-web.service
+# 0 disables start rate limiting, so Restart=always below never gives up.
+StartLimitIntervalSec=0
+
+[Service]
+Type=simple
+ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-blackbox.log /usr/local/bin/bee blackbox --export-dir /appdata/bee/export --state-file /appdata/bee/export/blackbox-state.json
+Restart=always
+RestartSec=1
+StandardOutput=journal
+StandardError=journal
+# Strongly negative score: the kernel OOM killer picks other processes first.
+OOMScoreAdjust=-900
+Nice=0
+
+[Install]
+WantedBy=multi-user.target
@@ -1,6 +1,6 @@
 [Unit]
 Description=Bee: bring up network interfaces via DHCP
-After=local-fs.target
+After=local-fs.target bee-blackbox.service
 Before=network-online.target bee-audit.service

 [Service]
@@ -1,6 +1,6 @@
 [Unit]
 Description=Bee: load NVIDIA kernel modules and create device nodes
-After=local-fs.target udev.service
+After=local-fs.target udev.service bee-blackbox.service
 Before=bee-audit.service

 [Service]
@@ -1,6 +1,6 @@
 [Unit]
 Description=Bee: runtime preflight self-check
-After=bee-network.service bee-nvidia.service
+After=bee-network.service bee-nvidia.service bee-blackbox.service
 Before=bee-audit.service

 [Service]
@@ -1,5 +1,6 @@
 [Unit]
 Description=Bee: hardware audit web viewer
+After=bee-blackbox.service
 StartLimitIntervalSec=0

 [Service]
@@ -10,6 +11,8 @@ RestartSec=3
 StandardOutput=journal
 StandardError=journal
 LimitMEMLOCK=infinity
+# No MemoryMax: bee-web spawns GPU test subprocesses (dcgmproftester etc.)
+# that legitimately use several GB; a cgroup limit kills them via OOM.
 # Keep the web server responsive during GPU/CPU stress (children inherit nice+10
 # via Setpriority in runCmdJob, but the bee-web parent stays at 0).
 Nice=0
@@ -65,6 +65,9 @@ done
 SQUASHFS="/run/live/medium/live/filesystem.squashfs"
 if [ ! -f "$SQUASHFS" ]; then
    echo "ERROR: squashfs not found at $SQUASHFS" >&2
+    echo "  The live medium may have been disconnected." >&2
+    echo "  Reconnect the disc and run:  bee-remount-medium --wait" >&2
+    echo "  Then re-run bee-install." >&2
    exit 1
 fi

@@ -162,10 +165,59 @@ log "  Mounted."
 log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
 log "  Source: $SQUASHFS"
 log "  Target: $MOUNT_ROOT"
-unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
-    grep -E '^\[|^inod|^created|^extract' | \
-    while read -r line; do log "  $line"; done || true
-log "  Unpack complete."
+
+# unsquashfs does not support resume, so retry the entire unpack step if the
+# source medium disappears mid-copy (e.g. CD physically disconnected) or if
+# unsquashfs itself aborts with a fatal error.
+UNPACK_ATTEMPTS=0
+UNPACK_MAX=5
+while true; do
+    UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
+    if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
+        die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
+    fi
+    if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
+        log "  Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
+    fi
+
+    # Re-check squashfs is reachable before each attempt
+    if [ ! -f "$SQUASHFS" ]; then
+        log "  SOURCE LOST: $SQUASHFS not found."
+        log "  Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
+        log "  then press Enter here to retry."
+        # EOF on stdin must not abort the installer; the attempt counter bounds the loop.
+        read -r _ || true
+        continue
+    fi
+
+    # wipe partial unpack so unsquashfs starts clean
+    if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
+        log "  Cleaning partial unpack from $MOUNT_ROOT ..."
+        # keep the mount point itself but remove its contents
+        find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
+    fi
+
+    # The pipeline's own exit status is that of the logging loop, not of
+    # unsquashfs, so the real status is handed out of the pipeline via a file.
+    UNPACK_RC_FILE=$(mktemp) || die "Cannot create temporary file for unpack status."
+    {
+        rc=0
+        unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 || rc=$?
+        echo "$rc" > "$UNPACK_RC_FILE"
+    } | \
+        grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
+        while IFS= read -r line; do log "  $line"; done || true
+    UNPACK_RC=$(cat "$UNPACK_RC_FILE" 2>/dev/null || true)
+    rm -f "$UNPACK_RC_FILE"
+    # A missing or garbled status means unsquashfs never finished: treat as fatal.
+    case "$UNPACK_RC" in
+        ''|*[!0-9]*) UNPACK_RC=1 ;;
+    esac
+
+    # Check squashfs is still reachable (gone = disc pulled during copy)
+    if [ ! -f "$SQUASHFS" ]; then
+        log "  WARNING: source medium lost during unpack — will retry after remount."
+        log "  Run 'bee-remount-medium --wait' in another terminal, then press Enter."
+        read -r _ || true
+        continue
+    fi
+
+    # unsquashfs exit status: 0 = success, 2 = non-fatal errors (files were
+    # still extracted), anything else = aborted, so the tree is incomplete.
+    UNPACK_USABLE=0
+    case "$UNPACK_RC" in
+        0)
+            UNPACK_USABLE=1
+            ;;
+        2)
+            log "  WARNING: unsquashfs reported non-fatal errors (exit status 2) — see lines above."
+            UNPACK_USABLE=1
+            ;;
+        *)
+            log "  WARNING: unsquashfs exited with status $UNPACK_RC — unpack is incomplete."
+            ;;
+    esac
+
+    # Verify the unpack produced a usable root (presence of /etc is a basic check)
+    if [ "$UNPACK_USABLE" = "1" ]; then
+        if [ -d "${MOUNT_ROOT}/etc" ]; then
+            log "  Unpack complete."
+            break
+        fi
+        log "  WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
+    fi
+    if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
+        log "  Retrying in 5 s ..."
+        sleep 5
+    fi
+done

 # ------------------------------------------------------------------
 log "--- Step 6/7: Configuring installed system ---"
@@ -258,6 +258,22 @@ else
    log "WARN: nvidia-smi not found — cannot enable persistence mode"
 fi

+# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
+# systems CUDA/DCGM can report "system not yet initialized" until fabric
+# training completes under nvidia-fabricmanager.
+#
+# The unit is only touched when systemctl exists and lists the unit file.
+# "restart" is tried first; a plain "start" is the fallback if restart fails.
+# Failures are logged (with the unit status for context) but never abort.
+if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
+    if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
+        log "nvidia-fabricmanager restarted"
+    elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
+        log "nvidia-fabricmanager started"
+    else
+        log "WARN: failed to start nvidia-fabricmanager.service"
+        # Prefix each status line so it is attributable in the combined log.
+        systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/  fabricmanager: /' || true
+    fi
+else
+    # Also reached when systemctl itself is unavailable.
+    log "WARN: nvidia-fabricmanager.service not installed"
+fi
+
 # Start DCGM host engine so dcgmi can discover GPUs.
 # nv-hostengine must run after the NVIDIA modules and device nodes are ready.
 # If it started too early (for example via systemd before bee-nvidia-load), it can
@@ -0,0 +1,326 @@
+#!/bin/sh
+# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
+
+set -u
+
+# Print all arguments on stdout, tagged with the script name.
+log() {
+    echo "[bee-nvidia-recover] $*"
+}
+
+# Like log, but marks the message as something that was holding the GPU.
+log_blocker() {
+    log "blocker: $*"
+}
+
+# Print the command synopsis on stdout (callers redirect to stderr as needed).
+usage() {
+    cat <<'EOF'
+usage:
+  bee-nvidia-recover restart-drivers
+  bee-nvidia-recover reset-gpu <index>
+EOF
+}
+
+# Succeeds when "systemctl cat" can show unit $1, i.e. the unit is known to
+# systemd. All output is discarded.
+unit_exists() {
+    systemctl cat "$1" >/dev/null 2>&1
+}
+
+# Succeeds when unit $1 is currently active.
+unit_is_active() {
+    systemctl is-active --quiet "$1" 2>/dev/null
+}
+
+stop_unit_if_active() {
+    unit="$1"
+    if unit_is_active "$unit"; then
+        log "stopping $unit"
+        systemctl stop "$unit"
+        return 0
+    fi
+    return 1
+}
+
+start_unit_if_marked() {
+    unit="$1"
+    marker="$2"
+    if [ "$marker" = "1" ] && unit_exists "$unit"; then
+        log "starting $unit"
+        systemctl start "$unit"
+    fi
+}
+
+wait_for_process_exit() {
+    name="$1"
+    tries=0
+    while pgrep -x "$name" >/dev/null 2>&1; do
+        tries=$((tries + 1))
+        if [ "$tries" -ge 15 ]; then
+            log "WARN: $name is still running after stop request"
+            return 1
+        fi
+        sleep 1
+    done
+    return 0
+}
+
+log_pid_details() {
+    pid="$1"
+    line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
+    if [ -n "$line" ]; then
+        log_blocker "$line"
+    else
+        log_blocker "pid $pid"
+    fi
+}
+
+collect_gpu_compute_pids() {
+    index="$1"
+    if ! command -v nvidia-smi >/dev/null 2>&1; then
+        return 0
+    fi
+    nvidia-smi --id="$index" \
+        --query-compute-apps=pid \
+        --format=csv,noheader,nounits 2>/dev/null \
+        | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
+        | grep -E '^[0-9]+$' || true
+}
+
+# Print the PIDs that have /dev/nvidia<$1> open according to fuser, one per
+# line. Prints nothing when the device node or fuser is missing.
+# NOTE(review): this assumes the nvidia-smi index passed by callers equals the
+# /dev/nvidiaN minor number — confirm for multi-GPU systems.
+collect_gpu_device_pids() {
+    index="$1"
+    dev="/dev/nvidia$index"
+    [ -e "$dev" ] || return 0
+    if command -v fuser >/dev/null 2>&1; then
+        # One token per line, cut each at its first non-digit character, then
+        # keep only the tokens that are purely numeric.
+        fuser "$dev" 2>/dev/null \
+            | tr ' ' '\n' \
+            | sed 's/[^0-9].*$//' \
+            | grep -E '^[0-9]+$' || true
+    fi
+}
+
+# Print the de-duplicated union of compute-app PIDs and device-node holder
+# PIDs for GPU $1, one per line.
+collect_gpu_holder_pids() {
+    index="$1"
+    {
+        collect_gpu_compute_pids "$index"
+        collect_gpu_device_pids "$index"
+    } | awk 'NF' | sort -u   # awk 'NF' drops blank lines before de-duplication
+}
+
+# Terminate every PID in the whitespace/newline-separated list $1.
+# Each PID is first logged as a blocker, then sent SIGTERM; after a fixed
+# 1-second grace period any PID that still exists is sent SIGKILL.
+# An empty list is a no-op. Signal delivery failures are ignored.
+kill_pid_list() {
+    pids="$1"
+    [ -n "$pids" ] || return 0
+
+    # $pids is deliberately unquoted in the loops so it splits into PIDs.
+    for pid in $pids; do
+        log_pid_details "$pid"
+    done
+    log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"
+    for pid in $pids; do
+        kill -TERM "$pid" >/dev/null 2>&1 || true
+    done
+    sleep 1
+    for pid in $pids; do
+        # kill -0 only probes whether the process still exists.
+        if kill -0 "$pid" >/dev/null 2>&1; then
+            log "forcing GPU holder PID $pid to exit"
+            kill -KILL "$pid" >/dev/null 2>&1 || true
+        fi
+    done
+}
+
+gpu_has_display_holders() {
+    index="$1"
+    holders=$(collect_gpu_device_pids "$index")
+    [ -n "$holders" ] || return 1
+    for pid in $holders; do
+        comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
+        case "$comm" in
+            Xorg|Xwayland|X|gnome-shell)
+                return 0
+                ;;
+        esac
+    done
+    return 1
+}
+
+# Stop nv-hostengine if a process with exactly that name is running.
+# Each instance is logged as a blocker, sent SIGTERM, and SIGKILLed if it has
+# not exited once wait_for_process_exit gives up.
+# Returns 0 and sets hostengine_was_active=1 when it was running, 1 otherwise.
+stop_nv_hostengine_if_running() {
+    if pgrep -x nv-hostengine >/dev/null 2>&1; then
+        # -x matches the process name exactly and -a prints the full command
+        # line. (A -f "^nv-hostengine$" pattern is matched against the whole
+        # command line, so it misses instances started via an absolute path or
+        # with arguments and nothing would be logged for them.)
+        pgrep -a -x nv-hostengine 2>/dev/null | while IFS= read -r line; do
+            [ -n "$line" ] || continue
+            log_blocker "$line"
+        done
+        log "stopping nv-hostengine"
+        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
+        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+        hostengine_was_active=1
+        return 0
+    fi
+    return 1
+}
+
+# Stop nvidia-fabricmanager.service when the unit exists and is active.
+# Returns 0 and sets fabric_was_active=1 when a stop was issued, 1 otherwise.
+# The blocker line is logged after the stop request, not before it.
+stop_fabricmanager_if_active() {
+    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
+        log_blocker "service nvidia-fabricmanager.service"
+        fabric_was_active=1
+        return 0
+    fi
+    return 1
+}
+
+# Stop display-manager.service and lightdm.service where they exist and are
+# active. Returns 0 and sets display_was_active=1 if at least one of them was
+# stopped, 1 if neither was.
+stop_display_stack_if_active() {
+    display_rc=1
+    for unit in display-manager.service lightdm.service; do
+        unit_exists "$unit" || continue
+        stop_unit_if_active "$unit" || continue
+        log_blocker "service $unit"
+        display_was_active=1
+        display_rc=0
+    done
+    return "$display_rc"
+}
+
+# Ask nvidia-smi to reset GPU $1. Returns nvidia-smi's exit status; its
+# output is not suppressed, so it appears in the caller's log.
+try_gpu_reset() {
+    index="$1"
+    log "resetting GPU $index"
+    nvidia-smi -r -i "$index"
+}
+
+drain_gpu_clients() {
+    display_was_active=0
+    fabric_was_active=0
+    hostengine_was_active=0
+
+    if pgrep -x nv-hostengine >/dev/null 2>&1; then
+        pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
+            [ -n "$line" ] || continue
+            log_blocker "$line"
+        done
+        log "stopping nv-hostengine"
+        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
+        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+        hostengine_was_active=1
+    fi
+
+    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
+        log_blocker "service nvidia-fabricmanager.service"
+        fabric_was_active=1
+    fi
+
+    for unit in display-manager.service lightdm.service; do
+        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
+            log_blocker "service $unit"
+            display_was_active=1
+        fi
+    done
+
+    for dev in /dev/nvidia[0-9]*; do
+        [ -e "$dev" ] || continue
+        holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
+        kill_pid_list "$holders"
+    done
+}
+
+# Bring back what a drain/reset stopped, based on the *_was_active flags
+# (unset flags are treated as 0). Persistence mode is re-enabled whenever
+# nvidia-smi is available, independent of the flags.
+restore_gpu_clients() {
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        if nvidia-smi -pm 1 >/dev/null 2>&1; then
+            log "enabled NVIDIA persistence mode"
+        else
+            log "WARN: failed to enable NVIDIA persistence mode"
+        fi
+    fi
+
+    # Only restart the host engine if this script stopped it and nothing has
+    # started it again in the meantime.
+    if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
+        log "starting nv-hostengine"
+        nv-hostengine
+    fi
+
+    start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
+    start_unit_if_marked display-manager.service "${display_was_active:-0}"
+    # lightdm is started explicitly only if it is still inactive after the
+    # display-manager.service start above.
+    if [ "${display_was_active:-0}" = "1" ] && unit_exists lightdm.service && ! unit_is_active lightdm.service; then
+        start_unit_if_marked lightdm.service "1"
+    fi
+}
+
+# Drain all GPU clients, unload the NVIDIA kernel modules, remove the device
+# nodes, reload the stack via bee-nvidia-load and restore the clients.
+# Every step is still attempted on a best-effort basis, but the function now
+# returns 1 if a module could not be unloaded or the reload failed, instead of
+# reporting success unconditionally.
+restart_drivers() {
+    restart_rc=0
+    drain_gpu_clients
+    # Order matters: dependants must be unloaded before the core nvidia module.
+    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
+        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
+            log "unloading module $mod"
+            if ! rmmod "$mod"; then
+                log "WARN: failed to unload module $mod"
+                restart_rc=1
+            fi
+        fi
+    done
+    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
+    log "reloading NVIDIA driver stack"
+    if ! /usr/local/bin/bee-nvidia-load; then
+        log "WARN: bee-nvidia-load exited with an error"
+        restart_rc=1
+    fi
+    # Always restore clients, even after a failure, so services are not left down.
+    restore_gpu_clients
+    return "$restart_rc"
+}
+
+# Kill whatever currently holds GPU $1, then attempt a reset.
+# Returns the status of the reset attempt.
+kill_holders_then_reset() {
+    holders=$(collect_gpu_holder_pids "$1")
+    if [ -n "$holders" ]; then
+        kill_pid_list "$holders"
+    fi
+    try_gpu_reset "$1"
+}
+
+# Reset GPU $1, escalating only as far as needed. Each stage stops one more
+# class of client before killing remaining holders and retrying the reset:
+#   direct -> nv-hostengine -> Fabric Manager -> display stack (only if a
+#   display process holds the device) -> one final attempt.
+# Clients that were stopped are restored after the first successful reset, or
+# after the final attempt. Returns 0 on success, otherwise the status of the
+# last reset attempt.
+reset_gpu() {
+    index="$1"
+    display_was_active=0
+    fabric_was_active=0
+    hostengine_was_active=0
+
+    for stage in direct hostengine fabricmanager display; do
+        case "$stage" in
+            hostengine)
+                stop_nv_hostengine_if_running || true
+                ;;
+            fabricmanager)
+                stop_fabricmanager_if_active || true
+                ;;
+            display)
+                # Skip this stage unless a display process holds the GPU.
+                gpu_has_display_holders "$index" || continue
+                stop_display_stack_if_active || true
+                ;;
+        esac
+        if kill_holders_then_reset "$index"; then
+            restore_gpu_clients
+            return 0
+        fi
+    done
+
+    # Last resort: report and kill whatever is left, then try once more.
+    holders=$(collect_gpu_holder_pids "$index")
+    if [ -n "$holders" ]; then
+        log "GPU $index still has holders after targeted drain"
+        kill_pid_list "$holders"
+    fi
+    try_gpu_reset "$index"
+    rc=$?
+    restore_gpu_clients
+    return "$rc"
+}
+
+cmd="${1:-}"
+case "$cmd" in
+    restart-drivers)
+        restart_drivers
+        ;;
+    reset-gpu)
+        if [ "$#" -ne 2 ]; then
+            usage >&2
+            exit 2
+        fi
+        reset_gpu "$2"
+        ;;
+    *)
+        usage >&2
+        exit 2
+        ;;
+esac
@@ -9,9 +9,9 @@ xset s noblank

 # Set desktop background.
 if [ -f /usr/share/bee/wallpaper.png ]; then
-    feh --bg-fill /usr/share/bee/wallpaper.png
+    feh --bg-center --image-bg '#000000' /usr/share/bee/wallpaper.png
 else
-    xsetroot -solid '#f6c90e'
+    xsetroot -solid '#000000'
 fi

 tint2 &
@@ -0,0 +1,100 @@
+#!/bin/bash
+# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
+#
+# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
+# was lost and /run/live/medium/live/filesystem.squashfs is missing.
+#
+# Usage: bee-remount-medium [--wait]
+#   --wait  keep retrying every 5 seconds until the medium is found (useful
+#           while physically reconnecting the device)
+
+set -euo pipefail
+
+MEDIUM_DIR="/run/live/medium"
+SQUASHFS_REL="live/filesystem.squashfs"
+WAIT_MODE=0
+
+# Parse command-line flags. --wait/-w enables retry mode and --help/-h prints
+# the synopsis and exits 0. Any other argument is silently ignored.
+for arg in "$@"; do
+    case "$arg" in
+        --wait|-w) WAIT_MODE=1 ;;
+        --help|-h)
+            echo "Usage: bee-remount-medium [--wait]"
+            echo "  Finds and remounts the live ISO medium to $MEDIUM_DIR"
+            echo "  --wait  retry every 5 s until a medium with squashfs is found"
+            exit 0 ;;
+    esac
+done
+
+log() { echo "[$(date +%H:%M:%S)] $*"; }
+die() { log "ERROR: $*" >&2; exit 1; }
+
+# Return all candidate block devices (optical + removable USB mass storage)
+find_candidates() {
+    # CD/DVD drives
+    for dev in /dev/sr* /dev/scd*; do
+        [ -b "$dev" ] && echo "$dev"
+    done
+    # USB/removable disks and partitions
+    for dev in /dev/sd* /dev/vd*; do
+        [ -b "$dev" ] || continue
+        # Only whole disks or partitions — skip the same device we are running from
+        local removable
+        local base
+        base=$(basename "$dev")
+        removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
+        [ "$removable" = "1" ] && echo "$dev"
+    done
+}
+
+# Try to mount $1 to $MEDIUM_DIR and check for squashfs
+# Probe device $1 by mounting it read-only on a scratch directory. If it
+# carries live/filesystem.squashfs, release the probe mount and mount the
+# device read-only on $MEDIUM_DIR instead.
+# Returns 0 only when the device ends up mounted on $MEDIUM_DIR.
+try_mount() {
+    local dev="$1"
+    local tmpdir
+    local has_squashfs=0
+    tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
+
+    if ! mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
+        rmdir "$tmpdir" 2>/dev/null || true
+        return 1
+    fi
+
+    if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
+        has_squashfs=1
+    fi
+
+    # The probe mount is released in either case.
+    umount "$tmpdir" 2>/dev/null || true
+    rmdir "$tmpdir" 2>/dev/null || true
+
+    if [ "$has_squashfs" != "1" ]; then
+        return 1
+    fi
+
+    # Unmount whatever is currently on MEDIUM_DIR (may be empty/stale)
+    umount "$MEDIUM_DIR" 2>/dev/null || true
+    mkdir -p "$MEDIUM_DIR"
+    if mount -o ro "$dev" "$MEDIUM_DIR"; then
+        log "Mounted $dev on $MEDIUM_DIR"
+        return 0
+    fi
+    log "Mount of $dev on $MEDIUM_DIR failed"
+    return 1
+}
+
+# Run one scan over all candidate devices. Returns 0 as soon as one of them
+# has been mounted on $MEDIUM_DIR, 1 if none qualified.
+attempt() {
+    local dev sq size
+    log "Scanning for ISO medium..."
+    for dev in $(find_candidates); do
+        log "  Trying $dev ..."
+        try_mount "$dev" || continue
+        sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
+        size=$(du -sh "$sq" | cut -f1)
+        log "SUCCESS: squashfs available at $sq ($size)"
+        return 0
+    done
+    return 1
+}
+
+# Entry point: a single scan by default, or scan every 5 seconds until a
+# medium is found when --wait was given.
+if [ "$WAIT_MODE" != "1" ]; then
+    attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
+else
+    log "Waiting for live medium (press Ctrl+C to abort)..."
+    until attempt; do
+        log "  Not found — retrying in 5 s (reconnect the disc now)"
+        sleep 5
+    done
+    exit 0
+fi
--- a/Show More
+++ b/Show More