Compare commits
74 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f | ||
|
|
679aeb9947 | ||
|
|
647e99b697 | ||
|
|
4af997f436 | ||
|
|
6caace0cc0 | ||
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 | ||
| c69bf07b27 | |||
| b3cf8e3893 | |||
| 17118298bd | |||
| 65bcc9ce81 | |||
| 0cdfbc5875 | |||
| cf9b54b600 | |||
| 0bfb3fe954 | |||
| 3053cb0710 | |||
| 2038489961 | |||
| e35484013e | |||
| 2cdf034bb0 | |||
| b89580c24d | |||
| df1385d3d6 | |||
| f8cd9a7376 | |||
| d52ec67f8f | |||
| 61c7abaa80 | |||
| d60f7758ba | |||
| 52c3a24b76 | |||
| 028bb30333 | |||
| 7d64e5d215 | |||
| 51b721aeb3 | |||
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 | |||
|
|
5285c0d101 | ||
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 | ||
|
|
e306250da7 | ||
|
|
c5b2081ac9 | ||
| 434528083e | |||
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,3 +3,4 @@
|
|||||||
dist/
|
dist/
|
||||||
iso/out/
|
iso/out/
|
||||||
build-cache/
|
build-cache/
|
||||||
|
audit/bee
|
||||||
|
|||||||
@@ -5,22 +5,18 @@ go 1.25.0
|
|||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-analyze/charts v0.5.26
|
modernc.org/sqlite v1.48.0
|
||||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
golang.org/x/image v0.24.0 // indirect
|
|
||||||
golang.org/x/sys v0.42.0 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
modernc.org/libc v1.70.0 // indirect
|
modernc.org/libc v1.72.0 // indirect
|
||||||
modernc.org/mathutil v1.7.1 // indirect
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
modernc.org/memory v1.11.0 // indirect
|
modernc.org/memory v1.11.0 // indirect
|
||||||
modernc.org/sqlite v1.48.0 // indirect
|
|
||||||
)
|
)
|
||||||
|
|||||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
|||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
|
||||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
|
||||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||||
|
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||||
|
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||||
|
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||||
|
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||||
|
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||||
|
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||||
|
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||||
|
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||||
|
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||||
|
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||||
|
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||||
|
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||||
|
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||||
|
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||||
|
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||||
|
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||||
|
|||||||
@@ -19,20 +19,22 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
DefaultExportDir = "/appdata/bee/export"
|
DefaultExportDir = "/appdata/bee/export"
|
||||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||||
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
||||||
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune"
|
||||||
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
||||||
|
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
||||||
|
DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
|
||||||
)
|
)
|
||||||
|
|
||||||
type App struct {
|
type App struct {
|
||||||
@@ -125,6 +127,7 @@ type satRunner interface {
|
|||||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
@@ -146,7 +149,7 @@ type satRunner interface {
|
|||||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -304,7 +307,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
|||||||
}
|
}
|
||||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||||
data, err := os.ReadFile(DefaultAuditJSONPath)
|
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
|||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultBeeBenchPerfDir
|
baseDir = DefaultBeeBenchPerfDir
|
||||||
}
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
|
|||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultBeeBenchPowerDir
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
}
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchAutotuneDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||||
|
}
|
||||||
|
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||||
|
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -744,8 +790,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
|||||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||||
body := "Results: " + path
|
body := "Results: " + path
|
||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
body += "\nERROR: " + err.Error()
|
body += "\nERROR: " + err.Error()
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -123,11 +124,13 @@ type fakeSAT struct {
|
|||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
|
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||||
|
runNCCLFn func(string, []int) (string, error)
|
||||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runMemoryFn func(string) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runStorageFn func(string) (string, error)
|
||||||
@@ -162,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaAutotuneFn != nil {
|
||||||
|
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaTargetedStressFn != nil {
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
@@ -287,10 +297,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNCCLFn != nil {
|
||||||
|
return f.runNCCLFn(baseDir, gpuIndices)
|
||||||
|
}
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var gotBaseDir string
|
||||||
|
var gotGPUIndices []int
|
||||||
|
a := &App{
|
||||||
|
sat: fakeSAT{
|
||||||
|
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||||
|
gotBaseDir = baseDir
|
||||||
|
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||||
|
return "/tmp/nccl-tests.tar.gz", nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunNCCLTests error: %v", err)
|
||||||
|
}
|
||||||
|
if path != "/tmp/nccl-tests.tar.gz" {
|
||||||
|
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||||
|
}
|
||||||
|
if gotBaseDir != "/tmp/sat" {
|
||||||
|
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||||
|
}
|
||||||
|
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||||
|
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -775,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -802,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
tr := tar.NewReader(gzr)
|
tr := tar.NewReader(gzr)
|
||||||
var names []string
|
var names []string
|
||||||
var auditJSON string
|
var auditJSON string
|
||||||
|
var manifest string
|
||||||
for {
|
for {
|
||||||
hdr, err := tr.Next()
|
hdr, err := tr.Next()
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
@@ -818,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
}
|
}
|
||||||
auditJSON = string(body)
|
auditJSON = string(body)
|
||||||
}
|
}
|
||||||
|
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||||
|
body, err := io.ReadAll(tr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read manifest entry: %v", err)
|
||||||
|
}
|
||||||
|
manifest = string(body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, want := range []string{
|
for _, want := range []string{
|
||||||
@@ -861,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||||
}
|
}
|
||||||
|
if !contains(manifest, "files:") {
|
||||||
|
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||||
|
}
|
||||||
|
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||||
|
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMainBanner(t *testing.T) {
|
func TestMainBanner(t *testing.T) {
|
||||||
|
|||||||
@@ -2,10 +2,29 @@ package app
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// readFileLimited reads path into memory, refusing files larger than maxBytes.
|
||||||
|
// Prevents OOM on corrupted or unexpectedly large data files.
|
||||||
|
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
data, err := io.ReadAll(io.LimitReader(f, maxBytes+1))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if int64(len(data)) > maxBytes {
|
||||||
|
return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
|
}
|
||||||
|
|
||||||
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
|||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
data, err := os.ReadFile(path)
|
data, err := readFileLimited(path, 10<<20)
|
||||||
if err != nil && !os.IsNotExist(err) {
|
if err != nil && !os.IsNotExist(err) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package app
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -22,6 +23,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-selfheal.service",
|
"bee-selfheal.service",
|
||||||
"bee-selfheal.timer",
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
|
"nvidia-dcgm.service",
|
||||||
|
"nvidia-fabricmanager.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
var supportBundleCommands = []struct {
|
var supportBundleCommands = []struct {
|
||||||
@@ -48,6 +51,43 @@ else
|
|||||||
fi
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
nvidia-smi topo -m 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "nvidia-smi not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
|
echo "systemctl not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "=== unit files ==="
|
||||||
|
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== active units ==="
|
||||||
|
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== failed units ==="
|
||||||
|
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||||
|
`}},
|
||||||
|
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||||
|
for candidate in \
|
||||||
|
/usr/bin/nvidia-fabricmanager \
|
||||||
|
/usr/bin/nv-fabricmanager \
|
||||||
|
/usr/bin/nvidia-fabricmanagerd \
|
||||||
|
/usr/bin/nvlsm; do
|
||||||
|
if [ -e "$candidate" ]; then
|
||||||
|
echo "=== $candidate ==="
|
||||||
|
ls -l "$candidate" 2>&1 || true
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||||
|
echo "no fabric manager binaries found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v lspci >/dev/null 2>&1; then
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
echo "lspci not found"
|
echo "lspci not found"
|
||||||
@@ -195,6 +235,10 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
}{
|
}{
|
||||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
|
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||||
|
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||||
|
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||||
|
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
@@ -381,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||||
|
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||||
|
if strings.TrimSpace(cfg.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
fmt.Fprintf(&body, "\nfiles:\n")
|
fmt.Fprintf(&body, "\nfiles:\n")
|
||||||
|
|
||||||
var files []string
|
var files []string
|
||||||
|
|||||||
@@ -160,11 +160,57 @@ type psuSDR struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var psuSlotPatterns = []*regexp.Regexp{
|
var psuSlotPatterns = []*regexp.Regexp{
|
||||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
|
// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
|
||||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
|
// does not fire after the digit; match explicitly with underscore terminator.
|
||||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
|
||||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||||
|
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||||
|
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
|
||||||
|
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1
|
||||||
|
// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
|
||||||
|
// Must be last: "power supply N" is already caught by the pattern above.
|
||||||
|
regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuInputPowerKeywords matches AC-input power sensor names across vendors:
|
||||||
|
// MSI: PSU1_POWER_IN, PSU1_PIN
|
||||||
|
// MLT: PSU1_PIN
|
||||||
|
// xFusion: (matched via default fallback — no explicit keyword)
|
||||||
|
// HPE: PS1 Input Power, PS1 Input Watts
|
||||||
|
func isPSUInputPower(name string) bool {
|
||||||
|
return strings.Contains(name, "input power") ||
|
||||||
|
strings.Contains(name, "input watts") ||
|
||||||
|
strings.Contains(name, "_pin") ||
|
||||||
|
strings.Contains(name, " pin") ||
|
||||||
|
strings.Contains(name, "_power_in") ||
|
||||||
|
strings.Contains(name, "power_in")
|
||||||
|
}
|
||||||
|
|
||||||
|
// isPSUOutputPower matches DC-output power sensor names across vendors:
|
||||||
|
// MSI: PSU1_POWER_OUT
|
||||||
|
// MLT: PSU1_POUT
|
||||||
|
// xFusion: PS1 POut
|
||||||
|
func isPSUOutputPower(name string) bool {
|
||||||
|
return strings.Contains(name, "output power") ||
|
||||||
|
strings.Contains(name, "output watts") ||
|
||||||
|
strings.Contains(name, "_pout") ||
|
||||||
|
strings.Contains(name, " pout") ||
|
||||||
|
strings.Contains(name, "_power_out") ||
|
||||||
|
strings.Contains(name, "power_out") ||
|
||||||
|
strings.Contains(name, "power supply bay") ||
|
||||||
|
strings.Contains(name, "psu bay")
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||||
|
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||||
|
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||||
|
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||||
|
v := parseFloatPtr(raw)
|
||||||
|
if v == nil || *v <= 0 || *v > max {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return v
|
||||||
}
|
}
|
||||||
|
|
||||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||||
@@ -194,24 +240,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
|||||||
|
|
||||||
lowerName := strings.ToLower(name)
|
lowerName := strings.ToLower(name)
|
||||||
switch {
|
switch {
|
||||||
case strings.Contains(lowerName, "input power"):
|
case isPSUInputPower(lowerName):
|
||||||
entry.inputPowerW = parseFloatPtr(value)
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||||
case strings.Contains(lowerName, "output power"):
|
case isPSUOutputPower(lowerName):
|
||||||
entry.outputPowerW = parseFloatPtr(value)
|
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
|
||||||
entry.outputPowerW = parseFloatPtr(value)
|
|
||||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||||
entry.inputVoltage = parseFloatPtr(value)
|
entry.inputVoltage = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "temp"):
|
case strings.Contains(lowerName, "temp"):
|
||||||
entry.temperatureC = parseFloatPtr(value)
|
entry.temperatureC = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||||
entry.healthPct = parsePercentPtr(value)
|
entry.healthPct = parsePercentPtr(value)
|
||||||
|
default:
|
||||||
|
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||||
|
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||||
|
// AC input if the value looks like wattage and no better data is set yet.
|
||||||
|
if entry.inputPowerW == nil {
|
||||||
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
out[slot] = entry
|
out[slot] = entry
|
||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PSUSlotPower holds SDR power readings for one PSU slot.
|
||||||
|
// Slot key used by PSUSlotsFromSDR is the 0-based index string,
|
||||||
|
// matching HardwarePowerSupply.Slot in the audit schema.
|
||||||
|
type PSUSlotPower struct {
|
||||||
|
InputW *float64 `json:"input_w,omitempty"`
|
||||||
|
OutputW *float64 `json:"output_w,omitempty"`
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||||
|
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||||
|
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||||
|
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||||
|
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||||
|
sdr := parsePSUSDR(sdrOutput)
|
||||||
|
if len(sdr) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make(map[string]PSUSlotPower, len(sdr))
|
||||||
|
for slot, entry := range sdr {
|
||||||
|
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||||
|
out[key] = PSUSlotPower{
|
||||||
|
InputW: entry.inputPowerW,
|
||||||
|
OutputW: entry.outputPowerW,
|
||||||
|
Status: entry.status,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||||
if len(sdr) == 0 {
|
if len(sdr) == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
|||||||
{name: "PWS1 Status", want: 1},
|
{name: "PWS1 Status", want: 1},
|
||||||
{name: "Power Supply Bay 8", want: 8},
|
{name: "Power Supply Bay 8", want: 8},
|
||||||
{name: "PS 6 Input Power", want: 6},
|
{name: "PS 6 Input Power", want: 6},
|
||||||
|
// MSI underscore format — \b does not fire between digit and '_'
|
||||||
|
{name: "PSU1_POWER_IN", want: 1},
|
||||||
|
{name: "PSU2_POWER_OUT", want: 2},
|
||||||
|
{name: "PSU4_STATUS", want: 4},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParsePSUSDRMSIFormat(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
raw := `
|
||||||
|
PSU1_STATUS | F1h | ok
|
||||||
|
PSU1_POWER_OUT | 928 Watts | ok
|
||||||
|
PSU1_POWER_IN | 976 Watts | ok
|
||||||
|
PSU2_STATUS | F2h | ok
|
||||||
|
PSU2_POWER_OUT | 944 Watts | ok
|
||||||
|
PSU2_POWER_IN | 992 Watts | ok
|
||||||
|
`
|
||||||
|
got := parsePSUSDR(raw)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len(got)=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
|
||||||
|
t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
|
||||||
|
}
|
||||||
|
if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
|
||||||
|
t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
|
||||||
|
}
|
||||||
|
if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
|
||||||
|
t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
@@ -0,0 +1,735 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
benchmarkPowerAutotuneVersion = 1
|
||||||
|
benchmarkPowerAutotuneIdleSec = 60
|
||||||
|
benchmarkPowerAutotuneLoadSec = 90
|
||||||
|
benchmarkPowerAutotuneSampleInterval = 3
|
||||||
|
defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||||
|
baseDir = strings.TrimSpace(baseDir)
|
||||||
|
if baseDir == "" {
|
||||||
|
return defaultBenchmarkPowerSourceConfigPath
|
||||||
|
}
|
||||||
|
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var cfg BenchmarkPowerAutotuneConfig
|
||||||
|
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||||
|
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||||
|
}
|
||||||
|
return &cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if cfg.Version <= 0 {
|
||||||
|
cfg.Version = benchmarkPowerAutotuneVersion
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(cfg, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResetBenchmarkPowerAutotuneConfig(path string) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeBenchmarkPowerSource(source string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
return BenchmarkPowerSourceSDRPSUInput
|
||||||
|
default:
|
||||||
|
return BenchmarkPowerSourceDCMI
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
||||||
|
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
||||||
|
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
||||||
|
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: true,
|
||||||
|
SelectedSource: selected,
|
||||||
|
EffectiveSource: selected,
|
||||||
|
Mode: "autotuned",
|
||||||
|
Reason: strings.TrimSpace(cfg.Reason),
|
||||||
|
ConfiguredAt: cfg.UpdatedAt,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sources := sampleBenchmarkPowerSources()
|
||||||
|
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: false,
|
||||||
|
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
||||||
|
Mode: "fallback",
|
||||||
|
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: false,
|
||||||
|
EffectiveSource: BenchmarkPowerSourceDCMI,
|
||||||
|
Mode: "fallback",
|
||||||
|
Reason: "autotune config not found; using temporary fallback source dcmi",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||||
|
decision := ResolveSystemPowerDecision(exportDir)
|
||||||
|
if decision.EffectiveSource != "" {
|
||||||
|
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||||
|
return value, decision, nil
|
||||||
|
} else if decision.Configured {
|
||||||
|
fallback := BenchmarkPowerSourceDCMI
|
||||||
|
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||||
|
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||||
|
decision.EffectiveSource = fallback
|
||||||
|
return fallbackValue, decision, nil
|
||||||
|
}
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||||
|
return 0, decision, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||||
|
switch normalizeBenchmarkPowerSource(source) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
sdr := sampleIPMISDRPowerSensors()
|
||||||
|
if sdr.PSUInW > 0 {
|
||||||
|
return sdr.PSUInW, nil
|
||||||
|
}
|
||||||
|
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||||
|
default:
|
||||||
|
return queryIPMIServerPowerW()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceDCMI] = w
|
||||||
|
}
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
stopCh := make(chan struct{})
|
||||||
|
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||||
|
}
|
||||||
|
close(stopCh)
|
||||||
|
return <-doneCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
||||||
|
if intervalSec <= 0 {
|
||||||
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||||
|
}
|
||||||
|
ch := make(chan []float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
var samples []float64
|
||||||
|
record := func() {
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
record()
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- samples
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
record()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
|
type benchmarkPowerAutotuneSample struct {
|
||||||
|
ElapsedSec float64
|
||||||
|
GPUAvgUsagePct float64
|
||||||
|
CPUUsagePct float64
|
||||||
|
GPUSumPowerW float64
|
||||||
|
Sources map[string]float64
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var out []benchmarkPowerAutotuneSample
|
||||||
|
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||||
|
start := time.Now()
|
||||||
|
for {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
row := benchmarkPowerAutotuneSample{
|
||||||
|
ElapsedSec: time.Since(start).Seconds(),
|
||||||
|
CPUUsagePct: sampleCPULoadPct(),
|
||||||
|
Sources: sampleBenchmarkPowerSources(),
|
||||||
|
}
|
||||||
|
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
||||||
|
var usageSum float64
|
||||||
|
for _, gpu := range gpuRows {
|
||||||
|
row.GPUSumPowerW += gpu.PowerW
|
||||||
|
usageSum += gpu.UsagePct
|
||||||
|
}
|
||||||
|
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
||||||
|
}
|
||||||
|
out = append(out, row)
|
||||||
|
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return out
|
||||||
|
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||||
|
} else {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
sample.ElapsedSec,
|
||||||
|
sample.GPUAvgUsagePct,
|
||||||
|
sample.GPUSumPowerW,
|
||||||
|
sample.CPUUsagePct,
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil || len(samples) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
var gpuPower []float64
|
||||||
|
sourceBuckets := map[string][]float64{}
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
values := sourceBuckets[source]
|
||||||
|
if len(values) == 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
len(samples),
|
||||||
|
benchmarkMean(gpuUsage),
|
||||||
|
benchmarkPercentile(gpuUsage, 95),
|
||||||
|
benchmarkMean(gpuPower),
|
||||||
|
benchmarkMean(cpuUsage),
|
||||||
|
benchmarkPercentile(cpuUsage, 95),
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if !candidate.Available {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||||
|
candidate.Source,
|
||||||
|
candidate.IdleAvgW,
|
||||||
|
candidate.LoadAvgW,
|
||||||
|
candidate.DeltaW,
|
||||||
|
gpuDelta,
|
||||||
|
candidate.RelativeError,
|
||||||
|
candidate.Confidence*100,
|
||||||
|
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||||
|
))
|
||||||
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
||||||
|
result := &BenchmarkPowerAutotuneValidation{}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
result.Reason = "no idle telemetry samples collected"
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
if sample.CPUUsagePct > 0 {
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.GPUSamples = len(gpuUsage)
|
||||||
|
result.CPUSamples = len(cpuUsage)
|
||||||
|
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
||||||
|
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
||||||
|
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
||||||
|
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
||||||
|
switch {
|
||||||
|
case result.GPUAvgUsagePct > 5:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
||||||
|
case result.GPUP95UsagePct > 10:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
||||||
|
case result.CPUAvgUsagePct > 20:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
||||||
|
case result.CPUP95UsagePct > 35:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
||||||
|
default:
|
||||||
|
result.Valid = true
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
||||||
|
idleBySource := map[string][]float64{}
|
||||||
|
loadBySource := map[string][]float64{}
|
||||||
|
var idleGPU []float64
|
||||||
|
var loadGPU []float64
|
||||||
|
for _, sample := range idle {
|
||||||
|
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
idleBySource[source] = append(idleBySource[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, sample := range load {
|
||||||
|
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
loadBySource[source] = append(loadBySource[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
idleGPUAvg := benchmarkMean(idleGPU)
|
||||||
|
loadGPUAvg := benchmarkMean(loadGPU)
|
||||||
|
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||||
|
if gpuDelta <= 0 {
|
||||||
|
gpuDelta = loadGPUAvg
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates := []BenchmarkPowerAutotuneCandidate{
|
||||||
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
||||||
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
||||||
|
}
|
||||||
|
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if candidate.Available && candidate.DeltaW > 0 {
|
||||||
|
available = append(available, candidate)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(available) == 0 {
|
||||||
|
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
||||||
|
}
|
||||||
|
sort.Slice(available, func(i, j int) bool {
|
||||||
|
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
||||||
|
if available[i].Source != available[j].Source {
|
||||||
|
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if available[i].RelativeError != available[j].RelativeError {
|
||||||
|
return available[i].RelativeError < available[j].RelativeError
|
||||||
|
}
|
||||||
|
return available[i].Samples > available[j].Samples
|
||||||
|
})
|
||||||
|
selected := available[0]
|
||||||
|
for idx := range candidates {
|
||||||
|
if candidates[idx].Source == selected.Source {
|
||||||
|
candidates[idx].Selected = true
|
||||||
|
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||||
|
candidate := BenchmarkPowerAutotuneCandidate{
|
||||||
|
Source: source,
|
||||||
|
Available: len(idle) > 0 && len(load) > 0,
|
||||||
|
Samples: minInt(len(idle), len(load)),
|
||||||
|
}
|
||||||
|
if !candidate.Available {
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
candidate.IdleAvgW = benchmarkMean(idle)
|
||||||
|
candidate.LoadAvgW = benchmarkMean(load)
|
||||||
|
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||||
|
if gpuDelta > 0 {
|
||||||
|
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||||
|
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||||
|
}
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "status=%s\n", result.Status)
|
||||||
|
fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
|
||||||
|
fmt.Fprintf(&b, "profile=%s\n", result.Profile)
|
||||||
|
fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
|
||||||
|
fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
|
||||||
|
fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
|
||||||
|
if result.SelectedSource != "" {
|
||||||
|
fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
|
||||||
|
}
|
||||||
|
if result.IdleValidation != nil {
|
||||||
|
fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
|
||||||
|
fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
|
||||||
|
fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
|
||||||
|
fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
|
||||||
|
fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
|
||||||
|
if result.IdleValidation.Reason != "" {
|
||||||
|
fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, candidate := range result.Candidates {
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
|
||||||
|
if candidate.Available {
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString("# Bee Bench Power Source Autotune\n\n")
|
||||||
|
fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
|
||||||
|
fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
|
||||||
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
|
||||||
|
fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
|
||||||
|
fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
|
||||||
|
fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
|
||||||
|
if result.SelectedSource != "" {
|
||||||
|
fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
if result.IdleValidation != nil {
|
||||||
|
b.WriteString("## Idle Validation\n\n")
|
||||||
|
fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
|
||||||
|
fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
|
||||||
|
fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
|
||||||
|
fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
|
||||||
|
fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
|
||||||
|
if result.IdleValidation.Reason != "" {
|
||||||
|
fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
if len(result.Candidates) > 0 {
|
||||||
|
b.WriteString("## Candidates\n\n")
|
||||||
|
b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
|
||||||
|
b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
|
||||||
|
for _, candidate := range result.Candidates {
|
||||||
|
if !candidate.Available {
|
||||||
|
fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
selected := "no"
|
||||||
|
if candidate.Selected {
|
||||||
|
selected = "yes"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
|
||||||
|
candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
for _, note := range result.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
||||||
|
allDevices := joinIndexList(gpuIndices)
|
||||||
|
switch strings.TrimSpace(strings.ToLower(kind)) {
|
||||||
|
case "power-fit", "power", "nvidia-bench-power":
|
||||||
|
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
||||||
|
if err == nil {
|
||||||
|
return cmd, "power-fit"
|
||||||
|
}
|
||||||
|
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
||||||
|
default:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", fmt.Sprintf("%d", durationSec),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
if sizeMB > 0 {
|
||||||
|
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
||||||
|
}
|
||||||
|
return cmd, "performance"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunNvidiaPowerSourceAutotune runs a two-stage measurement session — an idle
// validation window followed by a full-load window — to decide which server
// power source the benchmark should trust, saves the choice via
// SaveBenchmarkPowerAutotuneConfig, and writes result.json / summary.txt /
// report.md artifacts into a timestamped directory under baseDir.
//
// It returns the run directory path (when one was created) and an error on
// any failed stage; partial artifacts are still written before an early
// return where possible.
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
	// Normalise optional inputs so the rest of the method can use them freely.
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/autotune"
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}
	selected, err := resolveNvidiaGPUSelection(nil, nil)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
	}
	// Each invocation gets its own timestamped run directory for artifacts.
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "autotune-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
	// Status starts as FAILED and is only flipped to OK after a source is
	// selected and before the config save is attempted.
	result := BenchmarkPowerAutotuneResult{
		GeneratedAt:       time.Now().UTC(),
		Hostname:          hostname,
		ServerModel:       readServerModel(),
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		Status:            "FAILED",
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
	}

	// Stage 1: sample while idle and validate that the machine really is idle;
	// otherwise the load delta would be meaningless.
	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
	if result.IdleValidation == nil || !result.IdleValidation.Valid {
		if result.IdleValidation != nil {
			result.IdleValidationError = result.IdleValidation.Reason
			logFunc(result.IdleValidation.Reason)
		}
		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		// NOTE(review): when IdleValidation is nil this returns an error with
		// an empty message — TODO confirm that is intended.
		return runDir, fmt.Errorf("%s", result.IdleValidationError)
	}

	// Stage 2: sample concurrently while the load command runs; the buffered
	// channel lets the sampler goroutine finish even if we block on the command.
	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
	go func() {
		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
	}()
	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
	// Best-effort persist of the load command's output; failure is non-fatal.
	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
	loadSamples := <-loadSamplesCh
	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
	if runErr != nil {
		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
	}

	// Compare each candidate power source's idle→load delta against the
	// GPU-reported delta and pick the best match.
	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
	result.Candidates = candidates
	result.GPUPowerIdleW = idleGPUAvg
	result.GPUPowerLoadW = loadGPUAvg
	if chooseErr != nil {
		result.Notes = append(result.Notes, chooseErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, chooseErr
	}
	// Mirror the fallback used during selection: a non-positive delta falls
	// back to the load-stage GPU average for logging purposes.
	gpuDelta := loadGPUAvg - idleGPUAvg
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}
	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
	result.SelectedSource = selectedSource
	result.Status = "OK"
	// Pull confidence and the human-readable reason from the selected candidate.
	var confidence float64
	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
	for _, candidate := range candidates {
		if candidate.Selected {
			confidence = candidate.Confidence
			if strings.TrimSpace(candidate.SelectionNotes) != "" {
				selectionReason = candidate.SelectionNotes
			}
			break
		}
	}
	// Persist the decision so later benchmark runs can reuse it.
	cfg := BenchmarkPowerAutotuneConfig{
		Version:           benchmarkPowerAutotuneVersion,
		UpdatedAt:         time.Now().UTC(),
		SelectedSource:    selectedSource,
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
		Confidence:        confidence,
		Reason:            selectionReason,
	}
	result.Config = &cfg
	configPath := BenchmarkPowerSourceConfigPath(baseDir)
	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
		// The run itself succeeded but the config could not be stored; report
		// FAILED so callers do not assume the selection is persisted.
		result.Status = "FAILED"
		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
			return "", writeErr
		}
		return runDir, err
	}
	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
		return "", err
	}
	return runDir, nil
}
|
||||||
|
|
||||||
|
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
||||||
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("marshal autotune result: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune result.json: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune summary.txt: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune report.md: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// minInt returns the smaller of a and b.
//
// Retained for existing call sites; since the module targets Go 1.25 it now
// delegates to the built-in min (available since Go 1.21).
func minInt(a, b int) int {
	return min(a, b)
}
|
||||||
|
|
||||||
|
// Blank reference to os/exec — presumably kept so the import stays used after
// refactors removed direct exec calls from this file; TODO confirm and remove
// together with the import if no longer needed.
var _ = exec.ErrNotFound
|
||||||
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if result.ScalabilityScore > 0 {
|
if result.ScalabilityScore > 0 {
|
||||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||||
}
|
}
|
||||||
|
if result.PlatformPowerScore > 0 {
|
||||||
|
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||||||
|
}
|
||||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
@@ -81,69 +84,164 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Methodology ───────────────────────────────────────────────────────────
|
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||||||
b.WriteString("## Methodology\n\n")
|
b.WriteString("## Balanced Scorecard\n\n")
|
||||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
|
|
||||||
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
|
|
||||||
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
|
||||||
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
|
||||||
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
|
||||||
b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
|
||||||
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
|
||||||
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
|
||||||
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
|
|
||||||
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
|
||||||
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
|
||||||
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
|
||||||
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
|
|
||||||
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
|
|
||||||
b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
|
|
||||||
|
|
||||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
// Perspective 1: Compatibility — hard stops
|
||||||
b.WriteString("## Scorecard\n\n")
|
b.WriteString("### 1. Compatibility\n\n")
|
||||||
b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
{
|
||||||
b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
name := strings.TrimSpace(gpu.Name)
|
thermalThrottle := "-"
|
||||||
if name == "" {
|
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
name = "Unknown GPU"
|
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||||
|
}
|
||||||
|
fanAtThrottle := "-"
|
||||||
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
|
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||||
|
}
|
||||||
|
ecc := "-"
|
||||||
|
if gpu.ECC.Uncorrected > 0 {
|
||||||
|
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||||
|
}
|
||||||
|
compatStatus := "✓ OK"
|
||||||
|
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||||
|
compatStatus = "⛔ HARD STOP"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||||||
}
|
}
|
||||||
interconnect := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||||||
if gpu.Scores.InterconnectScore > 0 {
|
b.WriteString("\n")
|
||||||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
}
|
||||||
}
|
|
||||||
topsPerSM := "-"
|
// Perspective 2: Thermal headroom
|
||||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
{
|
||||||
}
|
var rows [][]string
|
||||||
synthetic := "-"
|
for _, gpu := range result.GPUs {
|
||||||
if gpu.Scores.SyntheticScore > 0 {
|
shutdownTemp := gpu.ShutdownTempC
|
||||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
if shutdownTemp <= 0 {
|
||||||
}
|
shutdownTemp = 90
|
||||||
mixed := "-"
|
}
|
||||||
if gpu.Scores.MixedScore > 0 {
|
slowdownTemp := gpu.SlowdownTempC
|
||||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
if slowdownTemp <= 0 {
|
||||||
}
|
slowdownTemp = 80
|
||||||
mixedEff := "-"
|
}
|
||||||
if gpu.Scores.MixedEfficiency > 0 {
|
headroom := gpu.Scores.TempHeadroomC
|
||||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
thermalStatus := "✓ OK"
|
||||||
}
|
switch {
|
||||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
|
case headroom < 10:
|
||||||
gpu.Index, name,
|
thermalStatus = "⛔ CRITICAL"
|
||||||
gpu.Status,
|
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||||
gpu.Scores.CompositeScore,
|
thermalStatus = "⚠ WARNING"
|
||||||
gpu.Scores.ComputeScore,
|
}
|
||||||
synthetic,
|
throttlePct := "-"
|
||||||
mixed,
|
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
mixedEff,
|
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||||
topsPerSM,
|
}
|
||||||
gpu.Scores.PowerSustainScore,
|
rows = append(rows, []string{
|
||||||
gpu.Scores.ThermalSustainScore,
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
gpu.Scores.StabilityScore,
|
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||||||
interconnect,
|
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||||||
)
|
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||||||
|
fmt.Sprintf("%.1f°C", headroom),
|
||||||
|
throttlePct,
|
||||||
|
thermalStatus,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perspective 3: Power delivery
|
||||||
|
b.WriteString("### 3. Power Delivery\n\n")
|
||||||
|
{
|
||||||
|
var rows [][]string
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
powerCap := "-"
|
||||||
|
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||||
|
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||||
|
}
|
||||||
|
fanDuty := "-"
|
||||||
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||||
|
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||||
|
}
|
||||||
|
powerStatus := "✓ OK"
|
||||||
|
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||||
|
powerStatus = "⚠ POWER LIMITED"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
powerCap,
|
||||||
|
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||||||
|
fanDuty,
|
||||||
|
powerStatus,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perspective 4: Performance
|
||||||
|
b.WriteString("### 4. Performance\n\n")
|
||||||
|
{
|
||||||
|
var rows [][]string
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
synthetic := "-"
|
||||||
|
if gpu.Scores.SyntheticScore > 0 {
|
||||||
|
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||||
|
}
|
||||||
|
mixed := "-"
|
||||||
|
if gpu.Scores.MixedScore > 0 {
|
||||||
|
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||||
|
}
|
||||||
|
mixedEff := "-"
|
||||||
|
if gpu.Scores.MixedEfficiency > 0 {
|
||||||
|
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||||
|
}
|
||||||
|
topsPerSM := "-"
|
||||||
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
|
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||||||
|
synthetic, mixed, mixedEff, topsPerSM,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||||||
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
|
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perspective 5: Anomaly flags
|
||||||
|
b.WriteString("### 5. Anomalies\n\n")
|
||||||
|
{
|
||||||
|
var rows [][]string
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
eccCorr := "-"
|
||||||
|
if gpu.ECC.Corrected > 0 {
|
||||||
|
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||||
|
}
|
||||||
|
syncBoost := "-"
|
||||||
|
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||||
|
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||||
|
}
|
||||||
|
powerVar := "OK"
|
||||||
|
if gpu.Scores.PowerSustainScore < 70 {
|
||||||
|
powerVar = "⚠ unstable"
|
||||||
|
}
|
||||||
|
thermalVar := "OK"
|
||||||
|
if gpu.Scores.ThermalSustainScore < 70 {
|
||||||
|
thermalVar = "⚠ unstable"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||||||
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||||
b.WriteString("## Per-GPU Details\n\n")
|
b.WriteString("## Per-GPU Details\n\n")
|
||||||
@@ -171,13 +269,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||||
}
|
}
|
||||||
if gpu.PowerLimitDerated {
|
if gpu.PowerLimitDerated {
|
||||||
fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
|
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||||||
}
|
}
|
||||||
if gpu.CalibratedPeakPowerW > 0 {
|
if gpu.CalibratedPeakPowerW > 0 {
|
||||||
if gpu.CalibratedPeakTempC > 0 {
|
if gpu.CalibratedPeakTempC > 0 {
|
||||||
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if gpu.LockedGraphicsClockMHz > 0 {
|
if gpu.LockedGraphicsClockMHz > 0 {
|
||||||
@@ -186,19 +284,27 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
// Steady-state telemetry
|
// Steady-state telemetry
|
||||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
b.WriteString(fmtMDTable(
|
||||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
[]string{"", "Avg", "P95"},
|
||||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
[][]string{
|
||||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||||||
b.WriteString("\n")
|
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||||||
|
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||||||
|
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||||||
|
},
|
||||||
|
))
|
||||||
|
b.WriteString("\n")
|
||||||
|
} else {
|
||||||
|
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||||
|
}
|
||||||
|
|
||||||
// Per-precision stability phases.
|
// Per-precision stability phases.
|
||||||
if len(gpu.PrecisionSteady) > 0 {
|
if len(gpu.PrecisionSteady) > 0 {
|
||||||
b.WriteString("**Per-precision stability:**\n\n")
|
b.WriteString("**Per-precision stability:**\n\n")
|
||||||
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
|
var precRows [][]string
|
||||||
for _, p := range gpu.PrecisionSteady {
|
for _, p := range gpu.PrecisionSteady {
|
||||||
eccCorr := "—"
|
eccCorr := "—"
|
||||||
eccUncorr := "—"
|
eccUncorr := "—"
|
||||||
@@ -210,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if strings.TrimSpace(status) == "" {
|
if strings.TrimSpace(status) == "" {
|
||||||
status = "OK"
|
status = "OK"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
precRows = append(precRows, []string{
|
||||||
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
p.Precision, status,
|
||||||
eccCorr, eccUncorr)
|
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||||||
|
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||||||
|
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||||||
|
eccCorr, eccUncorr,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
} else {
|
} else {
|
||||||
// Legacy: show combined-window variance.
|
// Legacy: show combined-window variance.
|
||||||
@@ -236,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
// Precision results
|
// Precision results
|
||||||
if len(gpu.PrecisionResults) > 0 {
|
if len(gpu.PrecisionResults) > 0 {
|
||||||
b.WriteString("**Precision results:**\n\n")
|
b.WriteString("**Precision results:**\n\n")
|
||||||
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
var presRows [][]string
|
||||||
for _, p := range gpu.PrecisionResults {
|
for _, p := range gpu.PrecisionResults {
|
||||||
if p.Supported {
|
if p.Supported {
|
||||||
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
presRows = append(presRows, []string{
|
||||||
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
p.Name,
|
||||||
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||||||
|
fmt.Sprintf("×%.3g", p.Weight),
|
||||||
|
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||||||
|
fmt.Sprintf("%d", p.Lanes),
|
||||||
|
fmt.Sprintf("%d", p.Iterations),
|
||||||
|
})
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -267,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||||
if result.Interconnect.Supported {
|
if result.Interconnect.Supported {
|
||||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
b.WriteString(fmtMDTable(
|
||||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
[]string{"Metric", "Avg", "Max"},
|
||||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
[][]string{
|
||||||
|
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||||||
|
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||||||
|
},
|
||||||
|
))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
for _, note := range result.Interconnect.Notes {
|
for _, note := range result.Interconnect.Notes {
|
||||||
@@ -280,20 +401,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
// ── Server Power ───────────────────────────────────────────────────────────
|
||||||
if sp := result.ServerPower; sp != nil {
|
if sp := result.ServerPower; sp != nil {
|
||||||
b.WriteString("## Server Power (IPMI)\n\n")
|
title := "## Server Power\n\n"
|
||||||
|
if sp.Source != "" {
|
||||||
|
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
|
||||||
|
}
|
||||||
|
b.WriteString(title)
|
||||||
if !sp.Available {
|
if !sp.Available {
|
||||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
b.WriteString("Server power measurement unavailable.\n\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("| | Value |\n|---|---|\n")
|
spRows := [][]string{
|
||||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||||||
if sp.ReportingRatio > 0 {
|
|
||||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
|
||||||
}
|
}
|
||||||
|
if sp.ReportingRatio > 0 {
|
||||||
|
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
for _, note := range sp.Notes {
|
for _, note := range sp.Notes {
|
||||||
@@ -304,19 +431,33 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||||
|
if len(result.PSUIssues) > 0 {
|
||||||
|
b.WriteString("## PSU Issues\n\n")
|
||||||
|
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||||||
|
for _, issue := range result.PSUIssues {
|
||||||
|
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||||
if cooling := result.Cooling; cooling != nil {
|
if cooling := result.Cooling; cooling != nil {
|
||||||
b.WriteString("## Cooling\n\n")
|
b.WriteString("## Cooling\n\n")
|
||||||
if cooling.Available {
|
if cooling.Available {
|
||||||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
dutyAvg, dutyP95 := "N/A", "N/A"
|
||||||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
|
||||||
if cooling.FanDutyCycleAvailable {
|
if cooling.FanDutyCycleAvailable {
|
||||||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||||||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||||||
} else {
|
|
||||||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
|
||||||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable(
|
||||||
|
[]string{"Metric", "Value"},
|
||||||
|
[][]string{
|
||||||
|
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||||||
|
{"Average fan duty cycle", dutyAvg},
|
||||||
|
{"P95 fan duty cycle", dutyP95},
|
||||||
|
},
|
||||||
|
))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||||
@@ -329,6 +470,23 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Platform Scalability ──────────────────────────────────────────────────
|
||||||
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
|
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||||
|
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||||
|
var scalRows [][]string
|
||||||
|
for _, step := range result.PerformanceRampSteps {
|
||||||
|
scalRows = append(scalRows, []string{
|
||||||
|
fmt.Sprintf("%d", step.StepIndex),
|
||||||
|
joinIndexList(step.GPUIndices),
|
||||||
|
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||||||
|
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||||
b.WriteString("## Raw Files\n\n")
|
b.WriteString("## Raw Files\n\n")
|
||||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||||
|
|||||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fmtMDTable renders a markdown table with column widths padded so the table
|
||||||
|
// is readable as plain text without a markdown renderer.
|
||||||
|
//
|
||||||
|
// headers contains the column header strings.
|
||||||
|
// rows contains data rows; each row must have the same number of cells as headers.
|
||||||
|
// Cells with fewer entries than headers are treated as empty.
|
||||||
|
func fmtMDTable(headers []string, rows [][]string) string {
|
||||||
|
ncols := len(headers)
|
||||||
|
if ncols == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute max width per column.
|
||||||
|
widths := make([]int, ncols)
|
||||||
|
for i, h := range headers {
|
||||||
|
if len(h) > widths[i] {
|
||||||
|
widths[i] = len(h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, row := range rows {
|
||||||
|
for i := 0; i < ncols; i++ {
|
||||||
|
cell := ""
|
||||||
|
if i < len(row) {
|
||||||
|
cell = row[i]
|
||||||
|
}
|
||||||
|
if len(cell) > widths[i] {
|
||||||
|
widths[i] = len(cell)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
|
||||||
|
// Header row.
|
||||||
|
b.WriteByte('|')
|
||||||
|
for i, h := range headers {
|
||||||
|
b.WriteByte(' ')
|
||||||
|
b.WriteString(h)
|
||||||
|
b.WriteString(strings.Repeat(" ", widths[i]-len(h)))
|
||||||
|
b.WriteString(" |")
|
||||||
|
}
|
||||||
|
b.WriteByte('\n')
|
||||||
|
|
||||||
|
// Separator row.
|
||||||
|
b.WriteByte('|')
|
||||||
|
for i := range headers {
|
||||||
|
b.WriteString(strings.Repeat("-", widths[i]+2))
|
||||||
|
b.WriteByte('|')
|
||||||
|
}
|
||||||
|
b.WriteByte('\n')
|
||||||
|
|
||||||
|
// Data rows.
|
||||||
|
for _, row := range rows {
|
||||||
|
b.WriteByte('|')
|
||||||
|
for i := 0; i < ncols; i++ {
|
||||||
|
cell := ""
|
||||||
|
if i < len(row) {
|
||||||
|
cell = row[i]
|
||||||
|
}
|
||||||
|
b.WriteByte(' ')
|
||||||
|
b.WriteString(cell)
|
||||||
|
b.WriteString(strings.Repeat(" ", widths[i]-len(cell)))
|
||||||
|
b.WriteString(" |")
|
||||||
|
}
|
||||||
|
b.WriteByte('\n')
|
||||||
|
}
|
||||||
|
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
@@ -1,8 +1,13 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||||
@@ -49,8 +54,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
|||||||
benchmarkPrecisionPhases,
|
benchmarkPrecisionPhases,
|
||||||
func(label string) string { return label },
|
func(label string) string { return label },
|
||||||
)
|
)
|
||||||
if len(labels) != 7 || len(phases) != 7 {
|
if len(labels) != 5 || len(phases) != 5 {
|
||||||
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
|
t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
|
||||||
}
|
}
|
||||||
if basePhaseSec != 60 {
|
if basePhaseSec != 60 {
|
||||||
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
||||||
@@ -61,7 +66,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
|||||||
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
||||||
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
||||||
}
|
}
|
||||||
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
|
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
|
||||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -80,7 +85,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
|
|||||||
if mixedPhaseSec != 3600 {
|
if mixedPhaseSec != 3600 {
|
||||||
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
||||||
}
|
}
|
||||||
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
|
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
|
||||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -99,7 +104,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
|
|||||||
if mixedPhaseSec != 14400 {
|
if mixedPhaseSec != 14400 {
|
||||||
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
||||||
}
|
}
|
||||||
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
|
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
|
||||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -133,10 +138,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
|
|||||||
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
|
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
|
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||||
t.Fatalf("supported=%v", got)
|
t.Fatalf("supported=%v", got)
|
||||||
}
|
}
|
||||||
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
|
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||||
t.Fatalf("supported=%v", got)
|
t.Fatalf("supported=%v", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -164,6 +169,93 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
before := BenchmarkThrottleCounters{}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||||
|
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||||
|
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldExec := satExecCommand
|
||||||
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
t.Fatalf("unexpected command: %s %v", name, args)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
satExecCommand = oldExec
|
||||||
|
})
|
||||||
|
|
||||||
|
var logs []string
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||||
|
logs = append(logs, line)
|
||||||
|
})
|
||||||
|
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||||
|
t.Fatalf("logs=%q want substring %q", got, want)
|
||||||
|
}
|
||||||
|
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||||
|
t.Fatalf("failed=%v want [0 2]", failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
script := filepath.Join(dir, "nvidia-smi")
|
||||||
|
argsLog := filepath.Join(dir, "args.log")
|
||||||
|
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||||
|
t.Fatalf("write script: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
if file == "nvidia-smi" {
|
||||||
|
return script, nil
|
||||||
|
}
|
||||||
|
return exec.LookPath(file)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
satLookPath = oldLookPath
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if len(failed) != 0 {
|
||||||
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(argsLog)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read args log: %v", err)
|
||||||
|
}
|
||||||
|
got := strings.Fields(string(raw))
|
||||||
|
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||||
|
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||||
|
t.Fatalf("args=%v want %v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -179,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
info benchmarkGPUInfo
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "prefers default tdp over current derated limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 600,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 600,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "caps default tdp to reported max limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 700,
|
||||||
|
MaxPowerLimitW: 650,
|
||||||
|
},
|
||||||
|
want: 650,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to current limit when default missing",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 525,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 525,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to max limit when only that is known",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
MaxPowerLimitW: 575,
|
||||||
|
},
|
||||||
|
want: 575,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||||
|
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -314,12 +459,40 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
|
||||||
|
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
|
||||||
|
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
|
||||||
|
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
|
||||||
|
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
|
||||||
|
},
|
||||||
|
PrecisionResults: []BenchmarkPrecisionResult{
|
||||||
|
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
|
||||||
|
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||||
|
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
if score.SyntheticScore != 100 {
|
||||||
|
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
|
||||||
|
}
|
||||||
|
if score.MixedScore != 50 {
|
||||||
|
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvsmiQ := []byte(`
|
nvsmiQ := []byte(`
|
||||||
GPU 00000000:4E:00.0
|
GPU 00000000:4E:00.0
|
||||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Min Power Limit : 200.00 W
|
||||||
|
Max Power Limit : 600.00 W
|
||||||
|
Default Power Limit : 575.00 W
|
||||||
|
Current Power Limit : 560.00 W
|
||||||
Clocks
|
Clocks
|
||||||
Graphics : 2422 MHz
|
Graphics : 2422 MHz
|
||||||
Memory : 12481 MHz
|
Memory : 12481 MHz
|
||||||
@@ -341,7 +514,7 @@ GPU 00000000:4F:00.0
|
|||||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||||
}
|
}
|
||||||
|
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
@@ -355,25 +528,49 @@ GPU 00000000:4F:00.0
|
|||||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||||
}
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||||
|
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||||
|
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].PowerLimitW != 560 {
|
||||||
|
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvsmiQ := []byte(`
|
nvsmiQ := []byte(`
|
||||||
GPU 00000000:4E:00.0
|
GPU 00000000:4E:00.0
|
||||||
|
Min Power Limit : 100.00 W
|
||||||
|
Max Power Limit : 900.00 W
|
||||||
Max Clocks
|
Max Clocks
|
||||||
Graphics : 9999 MHz
|
Graphics : 9999 MHz
|
||||||
Memory : 9999 MHz
|
Memory : 9999 MHz
|
||||||
`)
|
`)
|
||||||
// Already populated — must not be overwritten.
|
// Already populated — must not be overwritten.
|
||||||
infoByIndex := map[int]benchmarkGPUInfo{
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
0: {
|
||||||
|
Index: 0,
|
||||||
|
BusID: "00000000:4E:00.0",
|
||||||
|
MaxGraphicsClockMHz: 2430,
|
||||||
|
MaxMemoryClockMHz: 12481,
|
||||||
|
MinPowerLimitW: 200,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
}
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
|
|||||||
Available bool `json:"available"`
|
Available bool `json:"available"`
|
||||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
|
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
@@ -42,40 +43,151 @@ const (
|
|||||||
NvidiaBenchmarkProfileOvernight = "overnight"
|
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
|
||||||
|
BenchmarkPowerEngineTargetedPower = "targeted_power"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
||||||
|
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
||||||
|
// re-measure from actual task logs and update the constants here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
|
||||||
|
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||||
|
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||||
|
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||||
|
// - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
|
||||||
|
const (
|
||||||
|
// Performance Benchmark (bee-gpu-burn).
|
||||||
|
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||||
|
// Sequential per-GPU mode scales approximately linearly.
|
||||||
|
BenchmarkEstimatedPerfStandardSec = 960 // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
|
||||||
|
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
||||||
|
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
||||||
|
|
||||||
|
// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
|
||||||
|
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||||
|
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||||
|
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||||
|
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||||
|
)
|
||||||
|
|
||||||
type NvidiaBenchmarkOptions struct {
|
type NvidiaBenchmarkOptions struct {
|
||||||
Profile string
|
Profile string
|
||||||
SizeMB int
|
SizeMB int
|
||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
RunNCCL bool
|
RunNCCL bool
|
||||||
|
ServerPowerSource string
|
||||||
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||||
RampTotal int // total number of ramp-up steps in this run
|
RampTotal int // total number of ramp-up steps in this run
|
||||||
RampRunID string // shared identifier across all steps of the same ramp-up run
|
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
BenchmarkPowerSourceDCMI = "dcmi"
|
||||||
|
BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
|
||||||
|
)
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneConfig struct {
|
||||||
|
Version int `json:"version"`
|
||||||
|
UpdatedAt time.Time `json:"updated_at"`
|
||||||
|
SelectedSource string `json:"selected_source"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
|
Profile string `json:"profile,omitempty"`
|
||||||
|
IdleDurationSec int `json:"idle_duration_sec,omitempty"`
|
||||||
|
LoadDurationSec int `json:"load_duration_sec,omitempty"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||||
|
Confidence float64 `json:"confidence,omitempty"`
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type SystemPowerSourceDecision struct {
|
||||||
|
Configured bool `json:"configured"`
|
||||||
|
SelectedSource string `json:"selected_source,omitempty"`
|
||||||
|
EffectiveSource string `json:"effective_source,omitempty"`
|
||||||
|
Mode string `json:"mode,omitempty"` // autotuned, fallback, degraded
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
ConfiguredAt time.Time `json:"configured_at,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneResult struct {
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
|
Profile string `json:"profile,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
IdleDurationSec int `json:"idle_duration_sec"`
|
||||||
|
LoadDurationSec int `json:"load_duration_sec"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec"`
|
||||||
|
SelectedSource string `json:"selected_source,omitempty"`
|
||||||
|
IdleValidationError string `json:"idle_validation_error,omitempty"`
|
||||||
|
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
|
||||||
|
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
|
||||||
|
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
|
||||||
|
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneValidation struct {
|
||||||
|
Valid bool `json:"valid"`
|
||||||
|
GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
|
||||||
|
GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
|
||||||
|
CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
|
||||||
|
CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
|
||||||
|
GPUSamples int `json:"gpu_samples,omitempty"`
|
||||||
|
CPUSamples int `json:"cpu_samples,omitempty"`
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneCandidate struct {
|
||||||
|
Source string `json:"source"`
|
||||||
|
IdleAvgW float64 `json:"idle_avg_w,omitempty"`
|
||||||
|
LoadAvgW float64 `json:"load_avg_w,omitempty"`
|
||||||
|
DeltaW float64 `json:"delta_w,omitempty"`
|
||||||
|
Samples int `json:"samples,omitempty"`
|
||||||
|
RelativeError float64 `json:"relative_error,omitempty"`
|
||||||
|
Confidence float64 `json:"confidence,omitempty"`
|
||||||
|
Selected bool `json:"selected,omitempty"`
|
||||||
|
Available bool `json:"available"`
|
||||||
|
SelectionNotes string `json:"selection_notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type NvidiaBenchmarkResult struct {
|
type NvidiaBenchmarkResult struct {
|
||||||
BenchmarkVersion string `json:"benchmark_version"`
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
GeneratedAt time.Time `json:"generated_at"`
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
Hostname string `json:"hostname,omitempty"`
|
Hostname string `json:"hostname,omitempty"`
|
||||||
ServerModel string `json:"server_model,omitempty"`
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile"`
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
RampStep int `json:"ramp_step,omitempty"`
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
RampTotal int `json:"ramp_total,omitempty"`
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||||
Findings []string `json:"findings,omitempty"`
|
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||||
Warnings []string `json:"warnings,omitempty"`
|
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||||
Normalization BenchmarkNormalization `json:"normalization"`
|
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
OverallStatus string `json:"overall_status"`
|
||||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||||
|
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||||
|
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||||
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkNormalization struct {
|
type BenchmarkNormalization struct {
|
||||||
@@ -107,6 +219,12 @@ type BenchmarkGPUResult struct {
|
|||||||
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||||
|
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||||
|
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||||
|
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||||
|
// Fallback: 80°C.
|
||||||
|
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||||
// dcgmi targeted_power calibration run before the main benchmark.
|
// dcgmi targeted_power calibration run before the main benchmark.
|
||||||
// Used as the reference denominator for PowerSustainScore instead of
|
// Used as the reference denominator for PowerSustainScore instead of
|
||||||
@@ -206,25 +324,87 @@ type BenchmarkScorecard struct {
|
|||||||
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||||
StabilityScore float64 `json:"stability_score"`
|
// StabilityScore: fraction of steady-state time the GPU spent throttling
|
||||||
InterconnectScore float64 `json:"interconnect_score"`
|
// (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
|
||||||
CompositeScore float64 `json:"composite_score"`
|
StabilityScore float64 `json:"stability_score"`
|
||||||
|
|
||||||
|
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||||
|
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||||
|
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||||
|
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||||
|
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||||
|
|
||||||
|
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||||
|
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||||
|
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||||
|
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||||
|
|
||||||
|
InterconnectScore float64 `json:"interconnect_score"`
|
||||||
|
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||||
|
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||||
|
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||||
|
// that throttles an otherwise fast GPU.
|
||||||
|
ServerQualityScore float64 `json:"server_quality_score"`
|
||||||
|
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||||
|
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||||
|
CompositeScore float64 `json:"composite_score"`
|
||||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
|
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
|
||||||
// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
|
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
|
||||||
// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
|
// so benchmark and audit data can be correlated by slot.
|
||||||
// over-reporting its power consumption.
|
type BenchmarkPSUSlotPower struct {
|
||||||
|
InputW *float64 `json:"input_w,omitempty"` // AC wall input (PSUx_POWER_IN)
|
||||||
|
OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkServerPower captures server-side power from multiple independent
|
||||||
|
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||||
|
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||||
|
// covers only a subset of installed PSUs (partial coverage).
|
||||||
|
//
|
||||||
|
// Source legend:
|
||||||
|
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||||
|
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||||
|
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||||
type BenchmarkServerPower struct {
|
type BenchmarkServerPower struct {
|
||||||
Available bool `json:"available"`
|
Available bool `json:"available"`
|
||||||
IdleW float64 `json:"idle_w,omitempty"`
|
Source string `json:"source,omitempty"`
|
||||||
LoadedW float64 `json:"loaded_w,omitempty"`
|
Mode string `json:"mode,omitempty"`
|
||||||
DeltaW float64 `json:"delta_w,omitempty"`
|
Reason string `json:"reason,omitempty"`
|
||||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||||
Notes []string `json:"notes,omitempty"`
|
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||||
|
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||||
|
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||||
|
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||||
|
|
||||||
|
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||||
|
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||||
|
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||||
|
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||||
|
|
||||||
|
// PSU DC output sum — power delivered to server internals after conversion.
|
||||||
|
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||||
|
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||||
|
|
||||||
|
// Per-slot PSU readings at idle and at peak load.
|
||||||
|
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||||
|
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||||
|
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||||
|
|
||||||
|
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||||
|
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||||
|
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||||
|
|
||||||
|
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||||
|
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||||
|
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||||
|
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||||
@@ -265,16 +445,35 @@ type NvidiaPowerBenchResult struct {
|
|||||||
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||||
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||||
|
// this server under full GPU load. Use for rack power planning.
|
||||||
|
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||||
|
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||||
|
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||||
|
// actual wall-power draw as seen by the server's power supply.
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchGPU struct {
|
type NvidiaPowerBenchGPU struct {
|
||||||
Index int `json:"index"`
|
Index int `json:"index"`
|
||||||
Name string `json:"name,omitempty"`
|
Name string `json:"name,omitempty"`
|
||||||
BusID string `json:"bus_id,omitempty"`
|
BusID string `json:"bus_id,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||||
|
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||||
|
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||||
|
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||||
|
// stably with all other GPUs running simultaneously at their own limits.
|
||||||
|
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||||
|
// additional derating.
|
||||||
|
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||||
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||||
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||||
@@ -283,16 +482,55 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// Telemetry holds the aggregated stats from the final converged calibration
|
||||||
|
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||||
|
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||||
|
// Fan state sampled at the end of single-card calibration.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchStep struct {
|
type NvidiaPowerBenchStep struct {
|
||||||
StepIndex int `json:"step_index"`
|
StepIndex int `json:"step_index"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||||
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
NewGPUIndex int `json:"new_gpu_index"`
|
||||||
MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"`
|
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||||
AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"`
|
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||||
DeratedGPUCount int `json:"derated_gpu_count,omitempty"`
|
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||||
Status string `json:"status"`
|
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Derated bool `json:"derated,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// PSU slot readings sampled at end of this ramp step.
|
||||||
|
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||||
|
// Fan state at end of this ramp step.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||||
|
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
|
// scalability ramp-up phase of the performance benchmark.
|
||||||
|
type NvidiaPerformanceRampStep struct {
|
||||||
|
StepIndex int `json:"step_index"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||||
|
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||||
|
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||||
|
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||||
|
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||||
|
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||||
|
ScalabilityPct float64 `json:"scalability_pct"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
|
|||||||
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||||
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
|
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
|||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
|
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||||
for _, r := range rows {
|
for _, r := range rows {
|
||||||
dutyAvail := 0
|
dutyAvail := 0
|
||||||
if r.FanDutyCycleAvailable {
|
if r.FanDutyCycleAvailable {
|
||||||
dutyAvail = 1
|
dutyAvail = 1
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
|
dutyEstimated := 0
|
||||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
|
if r.FanDutyCycleEstimated {
|
||||||
|
dutyEstimated = 1
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||||
|
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||||
}
|
}
|
||||||
return os.WriteFile(path, b.Bytes(), 0644)
|
return os.WriteFile(path, b.Bytes(), 0644)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const installToRAMDir = "/dev/shm/bee-live"
|
const installToRAMDir = "/dev/shm/bee-live"
|
||||||
|
const copyProgressLogStep int64 = 100 * 1024 * 1024
|
||||||
|
|
||||||
func (s *System) IsLiveMediaInRAM() bool {
|
func (s *System) IsLiveMediaInRAM() bool {
|
||||||
return s.LiveMediaRAMState().InRAM
|
return s.LiveMediaRAMState().InRAM
|
||||||
@@ -140,26 +141,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
}
|
}
|
||||||
|
|
||||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||||
if err != nil || len(squashfsFiles) == 0 {
|
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
|
||||||
}
|
|
||||||
|
|
||||||
free := freeMemBytes()
|
|
||||||
var needed int64
|
|
||||||
for _, sf := range squashfsFiles {
|
|
||||||
fi, err2 := os.Stat(sf)
|
|
||||||
if err2 != nil {
|
|
||||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
|
||||||
}
|
|
||||||
needed += fi.Size()
|
|
||||||
}
|
|
||||||
const headroom = 256 * 1024 * 1024
|
|
||||||
if free > 0 && needed+headroom > free {
|
|
||||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
|
||||||
humanBytes(needed+headroom), humanBytes(free))
|
|
||||||
}
|
|
||||||
|
|
||||||
dstDir := installToRAMDir
|
dstDir := installToRAMDir
|
||||||
|
|
||||||
|
// If the source medium is unavailable, check whether a previous run already
|
||||||
|
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||||
|
// directly to the loop-rebind / bind-mount steps.
|
||||||
|
if !sourceAvailable {
|
||||||
|
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(copiedFiles) > 0 {
|
||||||
|
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||||
|
// Proceed to rebind with the already-copied files.
|
||||||
|
for _, dst := range copiedFiles {
|
||||||
|
base := filepath.Base(dst)
|
||||||
|
// Re-associate the loop device that was originally backed by the
|
||||||
|
// source file (now gone); find it by the old source path pattern.
|
||||||
|
srcGuess := "/run/live/medium/live/" + base
|
||||||
|
loopDev, lerr := findLoopForFile(srcGuess)
|
||||||
|
if lerr != nil {
|
||||||
|
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||||
|
} else {
|
||||||
|
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto bindMedium
|
||||||
|
}
|
||||||
|
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
free := freeMemBytes()
|
||||||
|
var needed int64
|
||||||
|
for _, sf := range squashfsFiles {
|
||||||
|
fi, err2 := os.Stat(sf)
|
||||||
|
if err2 != nil {
|
||||||
|
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||||
|
}
|
||||||
|
needed += fi.Size()
|
||||||
|
}
|
||||||
|
const headroom = 256 * 1024 * 1024
|
||||||
|
if free > 0 && needed+headroom > free {
|
||||||
|
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||||
|
humanBytes(needed+headroom), humanBytes(free))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if state.CopyPresent {
|
if state.CopyPresent {
|
||||||
log("Removing stale partial RAM copy before retry...")
|
log("Removing stale partial RAM copy before retry...")
|
||||||
}
|
}
|
||||||
@@ -199,6 +230,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bindMedium:
|
||||||
log("Copying remaining medium files...")
|
log("Copying remaining medium files...")
|
||||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||||
@@ -288,6 +320,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
defer out.Close()
|
defer out.Close()
|
||||||
total := fi.Size()
|
total := fi.Size()
|
||||||
var copied int64
|
var copied int64
|
||||||
|
var lastLogged int64
|
||||||
buf := make([]byte, 4*1024*1024)
|
buf := make([]byte, 4*1024*1024)
|
||||||
for {
|
for {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
@@ -299,7 +332,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
return werr
|
return werr
|
||||||
}
|
}
|
||||||
copied += int64(n)
|
copied += int64(n)
|
||||||
if logFunc != nil && total > 0 {
|
if shouldLogCopyProgress(copied, total, lastLogged) {
|
||||||
|
lastLogged = copied
|
||||||
pct := int(float64(copied) / float64(total) * 100)
|
pct := int(float64(copied) / float64(total) * 100)
|
||||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||||
}
|
}
|
||||||
@@ -314,6 +348,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
return out.Sync()
|
return out.Sync()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
|
||||||
|
if total <= 0 || copied <= 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if copied >= total {
|
||||||
|
return copied > lastLogged
|
||||||
|
}
|
||||||
|
if copied < copyProgressLogStep {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return copied-lastLogged >= copyProgressLogStep
|
||||||
|
}
|
||||||
|
|
||||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
|
|||||||
@@ -101,3 +101,26 @@ func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestShouldLogCopyProgress(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
total := int64(250 * 1024 * 1024)
|
||||||
|
step := int64(100 * 1024 * 1024)
|
||||||
|
|
||||||
|
if shouldLogCopyProgress(step-1, total, 0) {
|
||||||
|
t.Fatal("progress logged too early")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(step, total, 0) {
|
||||||
|
t.Fatal("expected log at first 100MB boundary")
|
||||||
|
}
|
||||||
|
if shouldLogCopyProgress(step+16*1024*1024, total, step) {
|
||||||
|
t.Fatal("progress logged again before next 100MB")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(2*step, total, step) {
|
||||||
|
t.Fatal("expected log at second 100MB boundary")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(total, total, 2*step) {
|
||||||
|
t.Fatal("expected final completion log")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||||
@@ -30,7 +33,12 @@ type KilledProcess struct {
|
|||||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||||
// SIGKILL to each one found. It returns a list of killed processes.
|
// SIGKILL to each one found. It returns a list of killed processes.
|
||||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||||
|
// The scan runs under a 5-second deadline to avoid blocking if the process
|
||||||
|
// table is very large (e.g. after a stress test with thousands of children).
|
||||||
func KillTestWorkers() []KilledProcess {
|
func KillTestWorkers() []KilledProcess {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
entries, err := os.ReadDir("/proc")
|
entries, err := os.ReadDir("/proc")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
@@ -38,6 +46,13 @@ func KillTestWorkers() []KilledProcess {
|
|||||||
|
|
||||||
var killed []KilledProcess
|
var killed []KilledProcess
|
||||||
for _, e := range entries {
|
for _, e := range entries {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
|
||||||
|
return killed
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
if !e.IsDir() {
|
if !e.IsDir() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"sort"
|
"sort"
|
||||||
@@ -14,13 +16,24 @@ import (
|
|||||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||||
// collected for the web UI metrics page.
|
// collected for the web UI metrics page.
|
||||||
type LiveMetricSample struct {
|
type LiveMetricSample struct {
|
||||||
Timestamp time.Time `json:"ts"`
|
Timestamp time.Time `json:"ts"`
|
||||||
Fans []FanReading `json:"fans"`
|
Fans []FanReading `json:"fans"`
|
||||||
Temps []TempReading `json:"temps"`
|
Temps []TempReading `json:"temps"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
PowerSource string `json:"power_source,omitempty"`
|
||||||
MemLoadPct float64 `json:"mem_load_pct"`
|
PowerMode string `json:"power_mode,omitempty"`
|
||||||
GPUs []GPUMetricRow `json:"gpus"`
|
PowerReason string `json:"power_reason,omitempty"`
|
||||||
|
PSUs []PSUReading `json:"psus,omitempty"`
|
||||||
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// PSUReading is a per-slot power supply input power reading.
|
||||||
|
type PSUReading struct {
|
||||||
|
Slot int `json:"slot"`
|
||||||
|
Name string `json:"name"`
|
||||||
|
PowerW float64 `json:"power_w"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// TempReading is a named temperature sensor value.
|
// TempReading is a named temperature sensor value.
|
||||||
@@ -54,8 +67,17 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// System power — returns 0 if unavailable
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||||
s.PowerW = sampleSystemPower()
|
s.PSUs = samplePSUPower()
|
||||||
|
|
||||||
|
// System power: use the global autotune-selected source when configured,
|
||||||
|
// otherwise fall back to the historical heuristic and mark the mode.
|
||||||
|
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||||
|
s.PowerW = powerW
|
||||||
|
s.PowerSource = decision.EffectiveSource
|
||||||
|
s.PowerMode = decision.Mode
|
||||||
|
s.PowerReason = decision.Reason
|
||||||
|
}
|
||||||
|
|
||||||
// CPU load — from /proc/stat
|
// CPU load — from /proc/stat
|
||||||
s.CPULoadPct = sampleCPULoadPct()
|
s.CPULoadPct = sampleCPULoadPct()
|
||||||
@@ -326,3 +348,46 @@ func compactAmbientTempName(chip, name string) string {
|
|||||||
}
|
}
|
||||||
return chip + " / " + name
|
return chip + " / " + name
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||||
|
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
||||||
|
// vendors where PSU sensors may not carry entity ID "10.N".
|
||||||
|
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||||
|
func samplePSUPower() []PSUReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slots := collector.PSUSlotsFromSDR(string(out))
|
||||||
|
if len(slots) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// Collect slot keys and sort for stable output.
|
||||||
|
keys := make([]int, 0, len(slots))
|
||||||
|
for k := range slots {
|
||||||
|
n, err := strconv.Atoi(k)
|
||||||
|
if err == nil {
|
||||||
|
keys = append(keys, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Ints(keys)
|
||||||
|
psus := make([]PSUReading, 0, len(keys))
|
||||||
|
for _, k := range keys {
|
||||||
|
entry := slots[strconv.Itoa(k)]
|
||||||
|
// Prefer AC input power; fall back to DC output power.
|
||||||
|
var w float64
|
||||||
|
if entry.InputW != nil && *entry.InputW > 0 {
|
||||||
|
w = *entry.InputW
|
||||||
|
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
||||||
|
w = *entry.OutputW
|
||||||
|
}
|
||||||
|
if w <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
||||||
|
}
|
||||||
|
if len(psus) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|||||||
30
audit/internal/platform/nvidia_recover.go
Normal file
30
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||||
|
|
||||||
|
func runNvidiaRecover(args ...string) (string, error) {
|
||||||
|
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||||
|
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||||
|
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||||
|
cmdArgs := []string{
|
||||||
|
"systemd-run",
|
||||||
|
"--quiet",
|
||||||
|
"--pipe",
|
||||||
|
"--wait",
|
||||||
|
"--collect",
|
||||||
|
"--service-type=oneshot",
|
||||||
|
"--unit", unit,
|
||||||
|
}
|
||||||
|
cmdArgs = append(cmdArgs, helperArgs...)
|
||||||
|
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
|
|||||||
"bee-audit",
|
"bee-audit",
|
||||||
"bee-web",
|
"bee-web",
|
||||||
"bee-sshsetup",
|
"bee-sshsetup",
|
||||||
|
"nvidia-dcgm",
|
||||||
|
"nvidia-fabricmanager",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
|||||||
@@ -20,6 +20,54 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for each SAT/validate test, derived from real
|
||||||
|
// production logs in _benchmark/_v8/.
|
||||||
|
//
|
||||||
|
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
|
||||||
|
// the corresponding Run*Pack function change, re-measure the wall-clock duration
|
||||||
|
// from actual task logs and update the matching constant here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||||
|
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||||
|
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||||
|
const (
|
||||||
|
// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
|
||||||
|
SATEstimatedCPUValidateSec = 65
|
||||||
|
// CPU stress: stress-ng 1800 s (stress mode default).
|
||||||
|
SATEstimatedCPUStressSec = 1800
|
||||||
|
|
||||||
|
// RAM: memtester 256 MB / 1 pass.
|
||||||
|
SATEstimatedMemoryValidateSec = 70
|
||||||
|
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||||
|
SATEstimatedMemoryStressSec = 140
|
||||||
|
|
||||||
|
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaGPUValidateSec = 85
|
||||||
|
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaGPUStressSec = 450
|
||||||
|
|
||||||
|
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaTargetedStressSec = 350
|
||||||
|
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaTargetedPowerSec = 350
|
||||||
|
|
||||||
|
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||||
|
SATEstimatedNvidiaPulseTestSec = 5000
|
||||||
|
|
||||||
|
// NCCL all_reduce_perf, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaInterconnectSec = 300
|
||||||
|
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
|
||||||
|
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
|
||||||
|
SATEstimatedNvidiaBandwidthSec = 2700
|
||||||
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
satExecCommand = exec.Command
|
satExecCommand = exec.Command
|
||||||
satLookPath = exec.LookPath
|
satLookPath = exec.LookPath
|
||||||
@@ -359,19 +407,21 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
|||||||
if index < 0 {
|
if index < 0 {
|
||||||
return "", fmt.Errorf("gpu index must be >= 0")
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
}
|
}
|
||||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||||
if len(raw) == 0 && err == nil {
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
raw = []byte("GPU reset completed.\n")
|
out = "GPU reset completed.\n"
|
||||||
}
|
}
|
||||||
return string(raw), err
|
return out, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
// detect GPU count
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
if err != nil {
|
||||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
return "", err
|
||||||
|
}
|
||||||
|
gpuCount := len(selected)
|
||||||
if gpuCount < 1 {
|
if gpuCount < 1 {
|
||||||
gpuCount = 1
|
gpuCount = 1
|
||||||
}
|
}
|
||||||
@@ -380,7 +430,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
}},
|
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||||
), logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -393,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
|||||||
profCmd []string
|
profCmd []string
|
||||||
profEnv []string
|
profEnv []string
|
||||||
)
|
)
|
||||||
if staggerSec > 0 && len(selected) > 1 {
|
if len(selected) > 1 {
|
||||||
|
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||||
|
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||||
|
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||||
|
// of CUDA_VISIBLE_DEVICES.
|
||||||
|
stagger := staggerSec
|
||||||
|
if stagger < 0 {
|
||||||
|
stagger = 0
|
||||||
|
}
|
||||||
profCmd = []string{
|
profCmd = []string{
|
||||||
"bee-dcgmproftester-staggered",
|
"bee-dcgmproftester-staggered",
|
||||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
"--stagger-seconds", strconv.Itoa(stagger),
|
||||||
"--devices", joinIndexList(selected),
|
"--devices", joinIndexList(selected),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -426,6 +484,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{
|
satJob{
|
||||||
@@ -443,6 +508,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{
|
satJob{
|
||||||
@@ -460,6 +532,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{
|
satJob{
|
||||||
@@ -552,10 +631,16 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
|
|||||||
if passes <= 0 {
|
if passes <= 0 {
|
||||||
passes = 1
|
passes = 1
|
||||||
}
|
}
|
||||||
// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
|
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||||
// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
|
// intentionally conservative enough for healthy systems while avoiding the
|
||||||
// controller can cause memtester to spin forever on a single subtest.
|
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||||
timeoutSec := sizeMB*passes*150/100 + 120
|
timeoutSec := sizeMB*passes*20/100 + 60
|
||||||
|
if timeoutSec < 180 {
|
||||||
|
timeoutSec = 180
|
||||||
|
}
|
||||||
|
if timeoutSec > 900 {
|
||||||
|
timeoutSec = 900
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -42,27 +43,56 @@ type GPUStressMetric struct {
|
|||||||
|
|
||||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||||
type FanStressRow struct {
|
type FanStressRow struct {
|
||||||
TimestampUTC string
|
TimestampUTC string
|
||||||
ElapsedSec float64
|
ElapsedSec float64
|
||||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||||
GPUs []GPUStressMetric
|
GPUs []GPUStressMetric
|
||||||
Fans []FanReading
|
Fans []FanReading
|
||||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64
|
||||||
|
SysPowerSource string
|
||||||
|
SysPowerMode string
|
||||||
}
|
}
|
||||||
|
|
||||||
type cachedPowerReading struct {
|
type cachedPowerReading struct {
|
||||||
Value float64
|
Value float64
|
||||||
|
Source string
|
||||||
|
Mode string
|
||||||
|
Reason string
|
||||||
UpdatedAt time.Time
|
UpdatedAt time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type fanObservationState struct {
|
||||||
|
MaxRPM map[string]float64 `json:"max_rpm"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type fanPeakCandidate struct {
|
||||||
|
FirstSeen time.Time
|
||||||
|
RPM float64
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
systemPowerCacheMu sync.Mutex
|
systemPowerCacheMu sync.Mutex
|
||||||
systemPowerCache cachedPowerReading
|
systemPowerCache cachedPowerReading
|
||||||
|
fanObservationMu sync.Mutex
|
||||||
|
fanObservation fanObservationState
|
||||||
|
fanObservationInit bool
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
)
|
)
|
||||||
|
|
||||||
const systemPowerHoldTTL = 15 * time.Second
|
const systemPowerHoldTTL = 15 * time.Second
|
||||||
|
|
||||||
|
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||||
|
|
||||||
|
const fanObservationMinPeakHold = time.Second
|
||||||
|
|
||||||
|
func normalizeObservedFanMaxRPM(rpm float64) float64 {
|
||||||
|
if rpm <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return math.Ceil(rpm/1000.0) * 1000.0
|
||||||
|
}
|
||||||
|
|
||||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
@@ -253,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
|||||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||||
row.Fans, _ = sampleFanSpeeds()
|
row.Fans, _ = sampleFanSpeeds()
|
||||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||||
row.SysPowerW = sampleSystemPower()
|
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||||
return row
|
return row
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -310,11 +340,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||||
|
updateFanObservation(fans, time.Now())
|
||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||||
if len(fans) > 0 {
|
if len(fans) > 0 {
|
||||||
|
updateFanObservation(fans, time.Now())
|
||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -323,6 +355,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
return nil, sensorsErr
|
return nil, sensorsErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func loadFanObservationLocked() {
|
||||||
|
if fanObservationInit {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fanObservationInit = true
|
||||||
|
fanObservation.MaxRPM = make(map[string]float64)
|
||||||
|
raw, err := os.ReadFile(fanObservationStatePath)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var persisted fanObservationState
|
||||||
|
if json.Unmarshal(raw, &persisted) != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for name, rpm := range persisted.MaxRPM {
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
if name == "" || rpm <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fanObservation.MaxRPM[name] = rpm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func saveFanObservationLocked() {
|
||||||
|
if len(fanObservation.MaxRPM) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
dir := filepath.Dir(fanObservationStatePath)
|
||||||
|
if dir == "" || dir == "." {
|
||||||
|
dir = "/var/log/bee-sat"
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func updateFanObservation(fans []FanReading, now time.Time) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fanObservationMu.Lock()
|
||||||
|
defer fanObservationMu.Unlock()
|
||||||
|
loadFanObservationLocked()
|
||||||
|
changed := false
|
||||||
|
for _, fan := range fans {
|
||||||
|
name := strings.TrimSpace(fan.Name)
|
||||||
|
if name == "" || fan.RPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
currentMax := fanObservation.MaxRPM[name]
|
||||||
|
if fan.RPM <= currentMax {
|
||||||
|
delete(fanPeakCandidates, name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if cand, ok := fanPeakCandidates[name]; ok {
|
||||||
|
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
|
||||||
|
newMax := math.Max(cand.RPM, fan.RPM)
|
||||||
|
if newMax > currentMax {
|
||||||
|
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
|
||||||
|
changed = true
|
||||||
|
}
|
||||||
|
delete(fanPeakCandidates, name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if fan.RPM > cand.RPM {
|
||||||
|
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
|
||||||
|
}
|
||||||
|
if changed {
|
||||||
|
saveFanObservationLocked()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
fanObservationMu.Lock()
|
||||||
|
defer fanObservationMu.Unlock()
|
||||||
|
loadFanObservationLocked()
|
||||||
|
var samples []float64
|
||||||
|
for _, fan := range fans {
|
||||||
|
name := strings.TrimSpace(fan.Name)
|
||||||
|
if name == "" || fan.RPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
maxRPM := fanObservation.MaxRPM[name]
|
||||||
|
if maxRPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
pct := fan.RPM / maxRPM * 100.0
|
||||||
|
if pct > 100 {
|
||||||
|
pct = 100
|
||||||
|
}
|
||||||
|
if pct < 0 {
|
||||||
|
pct = 0
|
||||||
|
}
|
||||||
|
samples = append(samples, pct)
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
// Handles two formats:
|
// Handles two formats:
|
||||||
//
|
//
|
||||||
@@ -428,12 +573,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
|||||||
|
|
||||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||||
// Returns the average duty cycle across all exposed PWM controls.
|
// Returns the average duty cycle across all exposed PWM controls.
|
||||||
func sampleFanDutyCyclePct() (float64, bool) {
|
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||||
out, err := exec.Command("sensors", "-j").Output()
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
if err != nil || len(out) == 0 {
|
if err != nil || len(out) == 0 {
|
||||||
return 0, false
|
fans, fanErr := sampleFanSpeeds()
|
||||||
|
if fanErr != nil {
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
return sampleFanDutyCyclePctFromFans(fans)
|
||||||
}
|
}
|
||||||
return parseFanDutyCyclePctSensorsJSON(out)
|
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||||
|
return pct, ok, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||||
|
return pct, true, true
|
||||||
|
}
|
||||||
|
return 0, false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||||
@@ -608,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
return max
|
return max
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||||
func sampleSystemPower() float64 {
|
// falling back to the historical heuristic before autotune or when degraded.
|
||||||
|
func sampleSystemPowerResolved() (float64, string, string) {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
current := 0.0
|
current, decision, err := SampleSystemPowerResolved("")
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
|
||||||
if err == nil {
|
|
||||||
current = parseDCMIPowerReading(string(out))
|
|
||||||
}
|
|
||||||
systemPowerCacheMu.Lock()
|
systemPowerCacheMu.Lock()
|
||||||
defer systemPowerCacheMu.Unlock()
|
defer systemPowerCacheMu.Unlock()
|
||||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
if err != nil {
|
||||||
|
current = 0
|
||||||
|
}
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||||
systemPowerCache = updated
|
systemPowerCache = updated
|
||||||
return value
|
return value, updated.Source, updated.Mode
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -643,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||||
if current > 0 {
|
if current > 0 {
|
||||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||||
return current, cache
|
return current, cache
|
||||||
}
|
}
|
||||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldPath := fanObservationStatePath
|
||||||
|
oldState := fanObservation
|
||||||
|
oldInit := fanObservationInit
|
||||||
|
oldCandidates := fanPeakCandidates
|
||||||
|
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||||
|
fanObservation = fanObservationState{}
|
||||||
|
fanObservationInit = false
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
|
t.Cleanup(func() {
|
||||||
|
fanObservationStatePath = oldPath
|
||||||
|
fanObservation = oldState
|
||||||
|
fanObservationInit = oldInit
|
||||||
|
fanPeakCandidates = oldCandidates
|
||||||
|
})
|
||||||
|
|
||||||
|
start := time.Unix(100, 0)
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||||
|
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||||
|
t.Fatalf("single-sample spike should not establish observed max")
|
||||||
|
}
|
||||||
|
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||||
|
|
||||||
|
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||||
|
}
|
||||||
|
if got < 43 || got > 44 {
|
||||||
|
t.Fatalf("got=%v want ~43.3", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
fanObservation = fanObservationState{}
|
||||||
|
fanObservationInit = false
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
|
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||||
|
}
|
||||||
|
if got < 43 || got > 44 {
|
||||||
|
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseDCMIPowerReading(t *testing.T) {
|
func TestParseDCMIPowerReading(t *testing.T) {
|
||||||
raw := `
|
raw := `
|
||||||
Instantaneous power reading: 512 Watts
|
Instantaneous power reading: 512 Watts
|
||||||
@@ -64,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||||
if got != 480 {
|
if got != 480 {
|
||||||
t.Fatalf("got=%v want cached 480", got)
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
}
|
}
|
||||||
@@ -72,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
t.Fatalf("updated=%+v", updated)
|
t.Fatalf("updated=%+v", updated)
|
||||||
}
|
}
|
||||||
|
|
||||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||||
if got != 530 {
|
if got != 530 {
|
||||||
t.Fatalf("got=%v want 530", got)
|
t.Fatalf("got=%v want 530", got)
|
||||||
}
|
}
|
||||||
@@ -81,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||||
if got != 0 {
|
if got != 0 {
|
||||||
t.Fatalf("expired cache returned %v want 0", got)
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||||
|
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||||
|
if len(cmd) != len(want) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if cmd[i] != want[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
if len(env) != 2 {
|
if len(env) != 2 {
|
||||||
|
|||||||
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
|
if name == "bee-nvidia" && action == ServiceRestart {
|
||||||
|
return runNvidiaRecover("restart-drivers")
|
||||||
|
}
|
||||||
// bee-web runs as the bee user; sudo is required to control system services.
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
|
|||||||
return taskPriorityInstallToRAM
|
return taskPriorityInstallToRAM
|
||||||
case "audit":
|
case "audit":
|
||||||
return taskPriorityAudit
|
return taskPriorityAudit
|
||||||
case "nvidia-bench-perf", "nvidia-bench-power":
|
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||||
return taskPriorityBenchmark
|
return taskPriorityBenchmark
|
||||||
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||||
return taskPriorityBurn
|
return taskPriorityBurn
|
||||||
@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
}
|
}
|
||||||
|
|
||||||
if rampUp && len(body.GPUIndices) > 1 {
|
if rampUp && len(body.GPUIndices) > 1 {
|
||||||
// Ramp-up mode: resolve GPU list, then create one task per prefix
|
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
|
||||||
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
|
// in Phase 2 (one additional GPU per step). A single task with all
|
||||||
|
// selected GPUs is sufficient — spawning N tasks with growing subsets
|
||||||
|
// would repeat all earlier steps redundantly.
|
||||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusBadRequest, err.Error())
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
} else {
|
} else {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||||
var allTasks []*Task
|
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
|
||||||
for step := 1; step <= len(resolved); step++ {
|
t := &Task{
|
||||||
subset := resolved[:step]
|
ID: newJobID("bee-bench-nvidia"),
|
||||||
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
|
Name: taskName,
|
||||||
t := &Task{
|
Target: target,
|
||||||
ID: newJobID("bee-bench-nvidia"),
|
Priority: defaultTaskPriority(target, taskParams{}),
|
||||||
Name: stepName,
|
Status: TaskPending,
|
||||||
Target: target,
|
CreatedAt: now,
|
||||||
Priority: defaultTaskPriority(target, taskParams{}),
|
params: taskParams{
|
||||||
Status: TaskPending,
|
GPUIndices: append([]int(nil), resolved...),
|
||||||
CreatedAt: now,
|
SizeMB: body.SizeMB,
|
||||||
params: taskParams{
|
BenchmarkProfile: body.Profile,
|
||||||
GPUIndices: append([]int(nil), subset...),
|
RunNCCL: runNCCL,
|
||||||
SizeMB: body.SizeMB,
|
ParallelGPUs: true,
|
||||||
BenchmarkProfile: body.Profile,
|
RampTotal: len(resolved),
|
||||||
RunNCCL: runNCCL && step == len(resolved),
|
RampRunID: rampRunID,
|
||||||
ParallelGPUs: true,
|
DisplayName: taskName,
|
||||||
RampStep: step,
|
},
|
||||||
RampTotal: len(resolved),
|
|
||||||
RampRunID: rampRunID,
|
|
||||||
DisplayName: stepName,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
allTasks = append(allTasks, t)
|
|
||||||
}
|
}
|
||||||
for _, t := range allTasks {
|
globalQueue.enqueue(t)
|
||||||
globalQueue.enqueue(t)
|
writeTaskRunResponse(w, []*Task{t})
|
||||||
}
|
|
||||||
writeTaskRunResponse(w, allTasks)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -707,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
|
||||||
|
return func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var body struct {
|
||||||
|
Profile string `json:"profile"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind"`
|
||||||
|
SizeMB int `json:"size_mb"`
|
||||||
|
}
|
||||||
|
if r.Body != nil {
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
profile := strings.TrimSpace(body.Profile)
|
||||||
|
if profile == "" {
|
||||||
|
profile = "standard"
|
||||||
|
}
|
||||||
|
benchmarkKind := strings.TrimSpace(body.BenchmarkKind)
|
||||||
|
if benchmarkKind == "" {
|
||||||
|
benchmarkKind = "power-fit"
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
taskName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, benchmarkKind)
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("bee-bench-autotune"),
|
||||||
|
Name: taskName,
|
||||||
|
Target: "nvidia-bench-autotune",
|
||||||
|
Priority: defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: now,
|
||||||
|
params: taskParams{
|
||||||
|
BenchmarkProfile: profile,
|
||||||
|
BenchmarkKind: benchmarkKind,
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
DisplayName: taskName,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
writeTaskRunResponse(w, []*Task{t})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"configured": false,
|
||||||
|
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"configured": true,
|
||||||
|
"config": cfg,
|
||||||
|
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
||||||
}
|
}
|
||||||
@@ -743,6 +809,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
|||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
t.job.abort()
|
t.job.abort()
|
||||||
}
|
}
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
platform.KillTestWorkers()
|
||||||
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
|||||||
@@ -178,16 +178,54 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
|
|||||||
}
|
}
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
defer globalQueue.mu.Unlock()
|
defer globalQueue.mu.Unlock()
|
||||||
if len(globalQueue.tasks) != 3 {
|
// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
|
||||||
t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
|
// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
|
||||||
}
|
}
|
||||||
for i, task := range globalQueue.tasks {
|
task := globalQueue.tasks[0]
|
||||||
if task.Target != "nvidia-bench-power" {
|
if task.Target != "nvidia-bench-power" {
|
||||||
t.Fatalf("task[%d] target=%q", i, task.Target)
|
t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
|
||||||
}
|
}
|
||||||
if task.Priority != taskPriorityBenchmark {
|
if task.Priority != taskPriorityBenchmark {
|
||||||
t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
|
t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||||
}
|
}
|
||||||
|
if task.params.RampTotal != 3 {
|
||||||
|
t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
task := globalQueue.tasks[0]
|
||||||
|
if task.Target != "nvidia-bench-autotune" {
|
||||||
|
t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
|
||||||
|
}
|
||||||
|
if task.params.BenchmarkKind != "power-fit" {
|
||||||
|
t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||||
|
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||||
|
// power charts where the filled area of each PSU shows its individual
|
||||||
|
// contribution and the total height equals the combined draw.
|
||||||
|
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range datasets {
|
||||||
|
if len(datasets[i]) == 0 {
|
||||||
|
datasets[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||||
|
pointCount = len(times)
|
||||||
|
|
||||||
|
// Build cumulative sums per time point.
|
||||||
|
cumulative := make([][]float64, len(datasets)+1)
|
||||||
|
for i := range cumulative {
|
||||||
|
cumulative[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
for i, ds := range datasets {
|
||||||
|
for j, v := range ds {
|
||||||
|
cumulative[i+1][j] = cumulative[i][j] + v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale is based on the total (top cumulative row).
|
||||||
|
total := cumulative[len(cumulative)-1]
|
||||||
|
yMin := floatPtr(0)
|
||||||
|
if yMax == nil {
|
||||||
|
yMax = autoMax120(total)
|
||||||
|
}
|
||||||
|
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||||
|
|
||||||
|
legendItems := make([]metricChartSeries, len(datasets))
|
||||||
|
for i, name := range names {
|
||||||
|
color := metricChartPalette[i%len(metricChartPalette)]
|
||||||
|
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stats label from totals.
|
||||||
|
statsLabel := chartStatsLabel([][]float64{total})
|
||||||
|
|
||||||
|
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
writeSingleAxisY(&b, layout, scale)
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
|
||||||
|
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
// Draw border polylines on top.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
|
||||||
|
writeLegend(&b, layout, legendItems)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||||
|
// (baseline and top), using the given color at 55% opacity.
|
||||||
|
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||||
|
n := len(top)
|
||||||
|
if n == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(baseline) < n {
|
||||||
|
baseline = make([]float64, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward path along top values, then backward along baseline values.
|
||||||
|
var points strings.Builder
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
if i > 0 {
|
||||||
|
points.WriteByte(' ')
|
||||||
|
}
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
for i := n - 1; i >= 0; i-- {
|
||||||
|
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
points.WriteByte(' ')
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||||
|
}
|
||||||
|
|
||||||
func writeSVGOpen(b *strings.Builder, width, height int) {
|
func writeSVGOpen(b *strings.Builder, width, height int) {
|
||||||
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -17,6 +20,25 @@ type jobState struct {
|
|||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
logPath string
|
logPath string
|
||||||
serialPrefix string
|
serialPrefix string
|
||||||
|
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||||
|
logBuf *bufio.Writer
|
||||||
|
}
|
||||||
|
|
||||||
|
// readTaskLogFile reads a task log, refusing files over 50 MB.
|
||||||
|
func readTaskLogFile(path string) ([]byte, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
data, err := io.ReadAll(io.LimitReader(f, 50<<20+1))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if int64(len(data)) > 50<<20 {
|
||||||
|
return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// abort cancels the job if it has a cancel function and is not yet done.
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
@@ -35,7 +57,7 @@ func (j *jobState) append(line string) {
|
|||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
j.lines = append(j.lines, line)
|
j.lines = append(j.lines, line)
|
||||||
if j.logPath != "" {
|
if j.logPath != "" {
|
||||||
appendJobLog(j.logPath, line)
|
j.writeLogLineLocked(line)
|
||||||
}
|
}
|
||||||
if j.serialPrefix != "" {
|
if j.serialPrefix != "" {
|
||||||
taskSerialWriteLine(j.serialPrefix + line)
|
taskSerialWriteLine(j.serialPrefix + line)
|
||||||
@@ -48,6 +70,35 @@ func (j *jobState) append(line string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
|
||||||
|
// Must be called with j.mu held. Uses a buffered writer kept open for the task
|
||||||
|
// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
|
||||||
|
func (j *jobState) writeLogLineLocked(line string) {
|
||||||
|
if j.logFile == nil {
|
||||||
|
f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
j.logFile = f
|
||||||
|
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||||
|
}
|
||||||
|
_, _ = j.logBuf.WriteString(line + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||||
|
func (j *jobState) closeLog() {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
if j.logBuf != nil {
|
||||||
|
_ = j.logBuf.Flush()
|
||||||
|
}
|
||||||
|
if j.logFile != nil {
|
||||||
|
_ = j.logFile.Close()
|
||||||
|
j.logFile = nil
|
||||||
|
j.logBuf = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (j *jobState) finish(errMsg string) {
|
func (j *jobState) finish(errMsg string) {
|
||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
@@ -119,7 +170,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
|||||||
if logPath == "" {
|
if logPath == "" {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
data, err := os.ReadFile(logPath)
|
data, err := readTaskLogFile(logPath)
|
||||||
if err != nil || len(data) == 0 {
|
if err != nil || len(data) == 0 {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|||||||
137
audit/internal/webui/layout.go
Normal file
137
audit/internal/webui/layout.go
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func layoutHead(title string) string {
|
||||||
|
return `<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
<title>` + html.EscapeString(title) + `</title>
|
||||||
|
<style>
|
||||||
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||||
|
*{box-sizing:border-box;margin:0;padding:0}
|
||||||
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||||
|
a{color:var(--accent);text-decoration:none}
|
||||||
|
/* Sidebar */
|
||||||
|
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
|
||||||
|
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||||
|
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||||
|
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||||
|
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
|
||||||
|
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
|
||||||
|
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
|
||||||
|
.nav{flex:1}
|
||||||
|
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||||
|
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||||
|
.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
|
||||||
|
/* Content */
|
||||||
|
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
|
||||||
|
.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
|
||||||
|
.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
|
||||||
|
.content{padding:24px;flex:1}
|
||||||
|
/* Cards */
|
||||||
|
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
|
||||||
|
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
|
||||||
|
.card-head-actions{justify-content:space-between}
|
||||||
|
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
|
||||||
|
.card-body{padding:16px}
|
||||||
|
/* Buttons */
|
||||||
|
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
|
||||||
|
.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
|
||||||
|
.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
|
||||||
|
.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
|
||||||
|
.btn-sm{padding:5px 10px;font-size:12px}
|
||||||
|
/* Tables */
|
||||||
|
table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
|
||||||
|
th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
|
||||||
|
td{padding:9px 14px;border-top:1px solid var(--border-lite)}
|
||||||
|
tr:first-child td{border-top:0}
|
||||||
|
tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||||
|
/* Status badges */
|
||||||
|
.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
|
||||||
|
.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Component chips — one small square per device */
|
||||||
|
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
|
||||||
|
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
|
||||||
|
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Output terminal */
|
||||||
|
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||||
|
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||||
|
/* Forms */
|
||||||
|
.form-row{margin-bottom:14px}
|
||||||
|
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
|
||||||
|
.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
|
||||||
|
.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
|
||||||
|
/* Grid */
|
||||||
|
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
|
||||||
|
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
|
||||||
|
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
|
||||||
|
/* iframe viewer */
|
||||||
|
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
|
||||||
|
/* Alerts */
|
||||||
|
.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
|
||||||
|
.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
|
||||||
|
.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
`
|
||||||
|
}
|
||||||
|
|
||||||
|
// layoutNav renders the sidebar navigation HTML for the page whose id equals
// active. buildLabel is displayed under the logo; a blank label falls back to
// "dev". If /run/bee-nvidia-mode is readable, a warning badge is emitted for
// the "gsp-off" and "gsp-stuck" modes.
func layoutNav(active string, buildLabel string) string {
	navItems := []struct{ id, label, href, onclick string }{
		{"dashboard", "Dashboard", "/", ""},
		{"audit", "Audit", "/audit", ""},
		{"validate", "Validate", "/validate", ""},
		{"burn", "Burn", "/burn", ""},
		{"benchmark", "Benchmark", "/benchmark", ""},
		{"tasks", "Tasks", "/tasks", ""},
		{"tools", "Tools", "/tools", ""},
	}
	if strings.TrimSpace(buildLabel) == "" {
		buildLabel = "dev"
	}
	var sb strings.Builder
	sb.WriteString(`<aside class="sidebar">`)
	sb.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
	sb.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
	// Best-effort NVIDIA GSP badge: silently skipped when the mode file is absent.
	if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
		switch strings.TrimSpace(string(raw)) {
		case "gsp-off":
			sb.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
		case "gsp-stuck":
			sb.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
		}
	}
	sb.WriteString(`<nav class="nav">`)
	for _, it := range navItems {
		classes := "nav-item"
		if it.id == active {
			classes += " active"
		}
		if it.onclick == "" {
			fmt.Fprintf(&sb, `<a class="%s" href="%s">%s</a>`, classes, it.href, it.label)
		} else {
			fmt.Fprintf(&sb, `<a class="%s" href="%s" onclick="%s">%s</a>`, classes, it.href, it.onclick, it.label)
		}
	}
	sb.WriteString(`</nav></aside>`)
	return sb.String()
}
|
||||||
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
|||||||
cpu_load_pct REAL,
|
cpu_load_pct REAL,
|
||||||
mem_load_pct REAL,
|
mem_load_pct REAL,
|
||||||
power_w REAL,
|
power_w REAL,
|
||||||
|
power_source TEXT,
|
||||||
|
power_mode TEXT,
|
||||||
|
power_reason TEXT,
|
||||||
PRIMARY KEY (ts)
|
PRIMARY KEY (ts)
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||||
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
|||||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
||||||
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
defer func() { _ = tx.Rollback() }()
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
_, err = tx.Exec(
|
_, err = tx.Exec(
|
||||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -161,14 +173,64 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
return tx.Commit()
|
return tx.Commit()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||||
|
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||||
|
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||||
|
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||||
|
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||||
|
// the overall shape of every chart.
|
||||||
|
//
|
||||||
|
// Called hourly by the metrics collector background goroutine.
|
||||||
|
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
start := deleteOlderThan.Unix()
|
||||||
|
end := downsampleBefore.Unix()
|
||||||
|
if end <= start {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||||
|
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
_, err := m.db.Exec(`
|
||||||
|
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||||
|
AND ts NOT IN (
|
||||||
|
SELECT MIN(ts) FROM `+table+`
|
||||||
|
WHERE ts >= ? AND ts < ?
|
||||||
|
GROUP BY ts / 60
|
||||||
|
)`, start, end, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||||
|
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||||
|
func (m *MetricsDB) Prune(before time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cutTS := before.Unix()
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadBetween returns samples in chronological order within the given time window.
|
// LoadBetween returns samples in chronological order within the given time window.
|
||||||
@@ -183,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
|
|||||||
start, end = end, start
|
start, end = end, start
|
||||||
}
|
}
|
||||||
return m.loadSamples(
|
return m.loadSamples(
|
||||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||||
start.Unix(), end.Unix(),
|
start.Unix(), end.Unix(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -199,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
type sysRow struct {
|
type sysRow struct {
|
||||||
ts int64
|
ts int64
|
||||||
cpu, mem, pwr float64
|
cpu, mem, pwr float64
|
||||||
|
powerSource string
|
||||||
|
powerMode string
|
||||||
|
powerReason string
|
||||||
}
|
}
|
||||||
var sysRows []sysRow
|
var sysRows []sysRow
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var r sysRow
|
var r sysRow
|
||||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
sysRows = append(sysRows, r)
|
sysRows = append(sysRows, r)
|
||||||
@@ -313,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||||
for i, r := range sysRows {
|
for i, r := range sysRows {
|
||||||
s := platform.LiveMetricSample{
|
s := platform.LiveMetricSample{
|
||||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||||
CPULoadPct: r.cpu,
|
CPULoadPct: r.cpu,
|
||||||
MemLoadPct: r.mem,
|
MemLoadPct: r.mem,
|
||||||
PowerW: r.pwr,
|
PowerW: r.pwr,
|
||||||
|
PowerSource: r.powerSource,
|
||||||
|
PowerMode: r.powerMode,
|
||||||
|
PowerReason: r.powerReason,
|
||||||
}
|
}
|
||||||
for _, idx := range gpuIndices {
|
for _, idx := range gpuIndices {
|
||||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||||
|
|||||||
613
audit/internal/webui/page_benchmark.go
Normal file
613
audit/internal/webui/page_benchmark.go
Normal file
@@ -0,0 +1,613 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// benchmarkHistoryRun is one saved benchmark run as shown in the results
// table on the Benchmark page.
type benchmarkHistoryRun struct {
	generatedAt time.Time // timestamp the run was generated — presumably used for ordering; confirm against loadBenchmarkHistory
	displayTime string // pre-formatted, human-readable time shown in the run's table row
	gpuScores map[int]float64 // composite score per GPU index; an absent index renders as "-"
	gpuStatuses map[int]string // per-GPU status label ("OK", "FAILED", "WARNING", "PARTIAL", or empty)
	overallStatus string // run-level status; the empty string is rendered as "OK"
}
|
||||||
|
|
||||||
|
// renderBenchmark returns the HTML fragment for the Benchmark page: a setup
// card (profile selector, GPU checkbox list, sequential/parallel/ramp-up mode
// radios, run and autotune buttons), a "Method Split" explainer table, the
// server-rendered saved-results section (renderBenchmarkResultsCard), a
// hidden live-output terminal card, plus the inline CSS and JS that drive
// GPU discovery (/api/gpu/nvidia), task enqueueing (/api/bee-bench/...),
// SSE log streaming (/api/tasks/{id}/stream), autotune status, and results
// refresh (/api/benchmark/results).
func renderBenchmark(opts HandlerOptions) string {
	// The whole page is a single raw-string literal; dynamic values are
	// spliced in via validateFmtDur(...) and renderBenchmarkResultsCard(...).
	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

<div class="grid2">
<div class="card">
<div class="card-head">Benchmark Setup</div>
<div class="card-body">
<div class="form-row">
<label>Profile</label>
<select id="benchmark-profile">
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
</select>
</div>
<div class="form-row">
<label>GPU Selection</label>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
</div>
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
</div>
<label class="benchmark-cb-row">
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
<span>Sequential — one GPU at a time</span>
</label>
<label class="benchmark-cb-row" id="benchmark-parallel-label">
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
<span>Parallel — all selected GPUs simultaneously</span>
</label>
<label class="benchmark-cb-row" id="benchmark-ramp-label">
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
</label>
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
</div>
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
</div>
</div>

<div class="card">
<div class="card-head">Method Split</div>
<div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
<table>
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
</table>
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
</div>
</div>
</div>

` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `

<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
</div>

<style>
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
</style>

<script>
let benchmarkES = null;
function benchmarkTaskIDs(payload) {
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
if (payload && payload.task_id) return [payload.task_id];
return [];
}
function benchmarkSelectedGPUIndices() {
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
.filter(function(el) { return el.checked && !el.disabled; })
.map(function(el) { return parseInt(el.value, 10); })
.filter(function(v) { return !Number.isNaN(v); })
.sort(function(a, b) { return a - b; });
}
function benchmarkMode() {
const el = document.querySelector('input[name="benchmark-mode"]:checked');
return el ? el.value : 'sequential';
}
function benchmarkUpdateSelectionNote() {
const selected = benchmarkSelectedGPUIndices();
const perfBtn = document.getElementById('benchmark-run-performance-btn');
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
const note = document.getElementById('benchmark-selection-note');
if (!selected.length) {
perfBtn.disabled = true;
fitBtn.disabled = true;
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
return;
}
perfBtn.disabled = false;
fitBtn.disabled = false;
const mode = benchmarkMode();
if (mode === 'ramp-up') {
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
} else if (mode === 'parallel') {
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
} else {
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
}
}
function benchmarkRenderGPUList(gpus) {
const root = document.getElementById('benchmark-gpu-list');
if (!gpus || !gpus.length) {
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
benchmarkUpdateSelectionNote();
return;
}
root.innerHTML = gpus.map(function(gpu) {
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
return '<label class="benchmark-gpu-row">'
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+ '</label>';
}).join('');
benchmarkApplyMultiGPUState(gpus.length);
benchmarkUpdateSelectionNote();
}
function benchmarkApplyMultiGPUState(gpuCount) {
var multiValues = ['parallel', 'ramp-up'];
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
radios.forEach(function(el) {
var isMulti = multiValues.indexOf(el.value) >= 0;
if (gpuCount < 2 && isMulti) {
el.disabled = true;
if (el.checked) {
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
if (seq) seq.checked = true;
}
var label = el.closest('label');
if (label) label.style.opacity = '0.4';
} else {
el.disabled = false;
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
var label = el.closest('label');
if (label) label.style.opacity = '';
}
});
benchmarkUpdateSelectionNote();
}
function benchmarkLoadGPUs() {
const status = document.getElementById('benchmark-run-status');
status.textContent = '';
fetch('/api/gpu/nvidia').then(function(r) {
return r.json().then(function(body) {
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
return body;
});
}).then(function(gpus) {
benchmarkRenderGPUList(gpus);
}).catch(function(err) {
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
benchmarkUpdateSelectionNote();
});
}
function benchmarkSelectAll() {
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
benchmarkUpdateSelectionNote();
}
function benchmarkSelectNone() {
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
benchmarkUpdateSelectionNote();
}
function runNvidiaBenchmark(kind) {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
if (!selected.length) {
status.textContent = 'Select at least one GPU.';
return;
}
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const mode = benchmarkMode();
const rampUp = mode === 'ramp-up' && selected.length > 1;
const parallelGPUs = mode === 'parallel' && kind === 'performance';
if (kind === 'power-fit' && mode === 'parallel') {
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
return;
}
const body = {
profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected,
run_nccl: kind === 'performance' && selected.length > 1,
parallel_gpus: parallelGPUs,
ramp_up: rampUp,
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
};
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
const term = document.getElementById('benchmark-terminal');
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
status.textContent = 'Queueing...';
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
fetch(endpoint, {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(body)
}).then(function(r) {
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No benchmark task was queued.');
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
const streamNext = function(idx, failures) {
if (idx >= taskIds.length) {
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
return;
}
const taskId = taskIds[idx];
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
benchmarkES.addEventListener('done', function(e) {
benchmarkES.close();
benchmarkES = null;
if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
const isLast = (idx + 1 >= taskIds.length);
streamNext(idx + 1, failures);
if (isLast) { benchmarkRefreshResults(); }
});
benchmarkES.onerror = function() {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
streamNext(idx + 1, failures + 1);
};
};
streamNext(0, 0);
}).catch(function(err) {
status.textContent = 'Error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
function benchmarkRenderAutotuneStatus(payload) {
const el = document.getElementById('benchmark-autotune-status');
if (!el) return;
if (!payload || !payload.configured || !payload.config) {
el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
return;
}
const cfg = payload.config || {};
const decision = payload.decision || {};
const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
const mode = decision.mode ? (' · mode ' + decision.mode) : '';
el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
}
function loadBenchmarkAutotuneStatus() {
fetch('/api/bee-bench/nvidia/autotune/status')
.then(function(r) {
return r.json().then(function(body) {
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
return body;
});
})
.then(function(body) { benchmarkRenderAutotuneStatus(body); })
.catch(function(err) {
const el = document.getElementById('benchmark-autotune-status');
if (el) el.textContent = 'Autotune status error: ' + err.message;
});
}
function runBenchmarkAutotune() {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
const term = document.getElementById('benchmark-terminal');
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
term.textContent = 'Enqueuing benchmark autotune...\n';
status.textContent = 'Queueing autotune...';
fetch('/api/bee-bench/nvidia/autotune/run', {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify({
profile: document.getElementById('benchmark-profile').value || 'standard',
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
gpu_indices: selected
})
}).then(function(r) {
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No autotune task was queued.');
const taskId = taskIds[0];
status.textContent = 'Autotune queued: ' + taskId;
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
benchmarkES.addEventListener('done', function(e) {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
loadBenchmarkAutotuneStatus();
});
}).catch(function(err) {
status.textContent = 'Autotune error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
benchmarkLoadGPUs();
loadBenchmarkAutotuneStatus();
function benchmarkRefreshResults() {
fetch('/api/benchmark/results')
.then(function(r) { return r.text(); })
.then(function(html) {
const el = document.getElementById('benchmark-results-section');
if (el) el.innerHTML = html;
})
.catch(function() {});
}
</script>`
}
|
||||||
|
|
||||||
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
|
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||||
|
perf := renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Perf Results",
|
||||||
|
"Composite score by saved benchmark run and GPU.",
|
||||||
|
"No saved performance benchmark runs yet.",
|
||||||
|
maxIdx,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||||
|
return perf + "\n" + power
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderBenchmarkResultsCardFromRuns renders one results card as an HTML
// table: one row per saved run, one column per GPU index from 0 through
// maxGPUIndex. Cells are colored by per-GPU status; a run's overall status
// gets its own colored column. When runs is empty, a stub card carrying
// only emptyMessage is returned instead.
//
// title, description, and emptyMessage are HTML-escaped before use.
// maxGPUIndex may be -1 (no GPU columns are emitted then).
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
	// Empty-state card: head + muted message only.
	if len(runs) == 0 {
		return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
	}
	var b strings.Builder
	b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
	if strings.TrimSpace(description) != "" {
		b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
	}
	b.WriteString(`<div style="overflow-x:auto">`)
	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
	// One header column per GPU index, inclusive of maxGPUIndex.
	for i := 0; i <= maxGPUIndex; i++ {
		b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
	}
	b.WriteString(`</tr></thead><tbody>`)
	for i, run := range runs {
		b.WriteString(`<tr>`)
		// Runs are numbered from 1 in display order (newest first —
		// ordering is established by the caller/loader, not here).
		b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
		b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
		// Overall status: empty means OK (green); FAILED is critical red;
		// anything else is treated as a warning.
		overallColor := "var(--ok)"
		overallLabel := run.overallStatus
		if overallLabel == "" {
			overallLabel = "OK"
		}
		if overallLabel == "FAILED" {
			overallColor = "var(--crit-fg,#9f3a38)"
		} else if overallLabel != "OK" {
			overallColor = "var(--warn)"
		}
		b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
		for idx := 0; idx <= maxGPUIndex; idx++ {
			// A run may not cover every GPU index; show a muted dash
			// for indices with no recorded score.
			score, ok := run.gpuScores[idx]
			if !ok {
				b.WriteString(`<td style="color:var(--muted)">-</td>`)
				continue
			}
			// Per-GPU cell coloring mirrors the overall-status scheme:
			// FAILED red, WARNING/PARTIAL (and unknown statuses) amber,
			// ""/OK uncolored.
			gpuStatus := run.gpuStatuses[idx]
			scoreColor := ""
			switch gpuStatus {
			case "FAILED":
				scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
			case "WARNING", "PARTIAL":
				scoreColor = ` style="color:var(--warn);font-weight:600"`
			case "", "OK":
			default:
				scoreColor = ` style="color:var(--warn);font-weight:600"`
			}
			b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
		}
		b.WriteString(`</tr>`)
	}
	b.WriteString(`</tbody></table></div></div></div>`)
	return b.String()
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||||
|
baseDir := app.DefaultBeeBenchPerfDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return -1, nil
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
return loadBenchmarkHistoryFromPaths(paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||||||
|
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||||
|
maxGPUIndex := -1
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var result platform.NvidiaBenchmarkResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
run := benchmarkHistoryRun{
|
||||||
|
generatedAt: result.GeneratedAt,
|
||||||
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
gpuScores: make(map[int]float64),
|
||||||
|
gpuStatuses: make(map[int]string),
|
||||||
|
overallStatus: result.OverallStatus,
|
||||||
|
}
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||||
|
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||||
|
if gpu.Index > maxGPUIndex {
|
||||||
|
maxGPUIndex = gpu.Index
|
||||||
|
}
|
||||||
|
}
|
||||||
|
runs = append(runs, run)
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
return maxGPUIndex, runs
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||||
|
baseDir := app.DefaultBeeBenchPowerDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
|
||||||
|
type powerRun struct {
|
||||||
|
generatedAt time.Time
|
||||||
|
displayTime string
|
||||||
|
result platform.NvidiaPowerBenchResult
|
||||||
|
}
|
||||||
|
var runs []powerRun
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var r platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &r); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
runs = append(runs, powerRun{
|
||||||
|
generatedAt: r.GeneratedAt,
|
||||||
|
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
result: r,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||||
|
|
||||||
|
latest := runs[0].result
|
||||||
|
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||||
|
if latest.Hostname != "" {
|
||||||
|
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||||
|
}
|
||||||
|
if latest.OverallStatus != "" {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if latest.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</p>`)
|
||||||
|
|
||||||
|
if len(latest.GPUs) > 0 {
|
||||||
|
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||||
|
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||||
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
|
for _, gpu := range latest.GPUs {
|
||||||
|
finalLimitW := gpu.StablePowerLimitW
|
||||||
|
if finalLimitW <= 0 {
|
||||||
|
finalLimitW = gpu.AppliedPowerLimitW
|
||||||
|
}
|
||||||
|
derated := gpu.Derated ||
|
||||||
|
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||||
|
rowStyle := ""
|
||||||
|
finalStyle := ""
|
||||||
|
if derated {
|
||||||
|
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||||
|
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||||
|
}
|
||||||
|
statusLabel := gpu.Status
|
||||||
|
if statusLabel == "" {
|
||||||
|
statusLabel = "OK"
|
||||||
|
}
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if statusLabel == "FAILED" {
|
||||||
|
statusColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if statusLabel != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
nominalStr := "-"
|
||||||
|
if gpu.DefaultPowerLimitW > 0 {
|
||||||
|
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
singleStr := "-"
|
||||||
|
if gpu.AppliedPowerLimitW > 0 {
|
||||||
|
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
multiStr := "-"
|
||||||
|
if gpu.StablePowerLimitW > 0 {
|
||||||
|
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||||
|
}
|
||||||
|
p95Str := "-"
|
||||||
|
if gpu.MaxObservedPowerW > 0 {
|
||||||
|
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr` + rowStyle + `>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||||
|
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(runs) > 1 {
|
||||||
|
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||||
|
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||||
|
for i, run := range runs {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if run.result.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr>`)
|
||||||
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div></details>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
383
audit/internal/webui/page_burn.go
Normal file
383
audit/internal/webui/page_burn.go
Normal file
@@ -0,0 +1,383 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
func renderBurn() string {
|
||||||
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||||
|
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||||
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Burn Profile</div>
|
||||||
|
<div class="card-body burn-profile-body">
|
||||||
|
<div class="burn-profile-col">
|
||||||
|
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
|
||||||
|
</div>
|
||||||
|
<div class="burn-profile-col burn-profile-action">
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||||
|
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
|
||||||
|
</div>
|
||||||
|
<div class="burn-profile-col burn-profile-action">
|
||||||
|
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||||
|
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
||||||
|
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||||
|
<div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
|
||||||
|
<label class="cb-row">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="sequential" checked>
|
||||||
|
<span>Sequential — selected GPUs one at a time</span>
|
||||||
|
</label>
|
||||||
|
<label class="cb-row" id="burn-parallel-label">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="parallel">
|
||||||
|
<span>Parallel — all selected GPUs simultaneously</span>
|
||||||
|
</label>
|
||||||
|
<label class="cb-row" id="burn-ramp-label">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="ramp-up">
|
||||||
|
<span>Ramp-up — add one GPU at a time</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="burn-section">Core Burn Paths</div>
|
||||||
|
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||||
|
<div class="card burn-card">
|
||||||
|
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
|
||||||
|
<div class="card-body burn-card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card burn-card">
|
||||||
|
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
|
||||||
|
<div class="card-body burn-card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||||
|
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
|
||||||
|
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
|
||||||
|
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
|
||||||
|
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||||
|
.burn-profile-col { min-width:0; }
|
||||||
|
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
|
||||||
|
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
|
||||||
|
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
|
||||||
|
.burn-grid { align-items:stretch; }
|
||||||
|
.burn-card { height:100%; display:flex; flex-direction:column; }
|
||||||
|
.burn-card-body { flex:1; display:flex; flex-direction:column; }
|
||||||
|
.card-head-actions { justify-content:space-between; }
|
||||||
|
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
|
||||||
|
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let biES = null;
|
||||||
|
function burnTaskIDs(payload) {
|
||||||
|
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||||
|
if (payload && payload.task_id) return [payload.task_id];
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
function burnProfile() {
|
||||||
|
const selected = document.querySelector('input[name="burn-profile"]:checked');
|
||||||
|
return selected ? selected.value : 'smoke';
|
||||||
|
}
|
||||||
|
function burnSelectedGPUIndices() {
|
||||||
|
return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
|
||||||
|
.filter(function(el) { return el.checked && !el.disabled; })
|
||||||
|
.map(function(el) { return parseInt(el.value, 10); })
|
||||||
|
.filter(function(v) { return !Number.isNaN(v); })
|
||||||
|
.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
function burnNvidiaMode() {
|
||||||
|
const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
|
||||||
|
return el ? el.value : 'sequential';
|
||||||
|
}
|
||||||
|
function burnApplyMultiGPUState(gpuCount) {
|
||||||
|
var multiValues = ['parallel', 'ramp-up'];
|
||||||
|
var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
|
||||||
|
radios.forEach(function(el) {
|
||||||
|
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||||
|
if (gpuCount < 2 && isMulti) {
|
||||||
|
el.disabled = true;
|
||||||
|
if (el.checked) {
|
||||||
|
var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
|
||||||
|
if (seq) seq.checked = true;
|
||||||
|
}
|
||||||
|
var label = el.closest('label');
|
||||||
|
if (label) label.style.opacity = '0.4';
|
||||||
|
} else {
|
||||||
|
el.disabled = false;
|
||||||
|
var label = el.closest('label');
|
||||||
|
if (label) label.style.opacity = '';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function burnUpdateSelectionNote() {
|
||||||
|
const note = document.getElementById('burn-selection-note');
|
||||||
|
const selected = burnSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
|
||||||
|
}
|
||||||
|
function burnRenderGPUList(gpus) {
|
||||||
|
const root = document.getElementById('burn-gpu-list');
|
||||||
|
if (!gpus || !gpus.length) {
|
||||||
|
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
root.innerHTML = gpus.map(function(gpu) {
|
||||||
|
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||||
|
return '<label class="burn-gpu-row">'
|
||||||
|
+ '<input class="burn-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="burnUpdateSelectionNote()">'
|
||||||
|
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||||
|
+ '</label>';
|
||||||
|
}).join('');
|
||||||
|
burnApplyMultiGPUState(gpus.length);
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function burnSelectAll() {
|
||||||
|
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function burnSelectNone() {
|
||||||
|
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function burnLoadGPUs() {
|
||||||
|
fetch('/api/gpu/nvidia').then(function(r) {
|
||||||
|
return r.json().then(function(body) {
|
||||||
|
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||||
|
return body;
|
||||||
|
});
|
||||||
|
}).then(function(gpus) {
|
||||||
|
burnRenderGPUList(gpus);
|
||||||
|
}).catch(function(err) {
|
||||||
|
document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
||||||
|
const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
|
||||||
|
if (useSelectedNvidia) {
|
||||||
|
const selected = burnSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
|
}
|
||||||
|
body.gpu_indices = selected;
|
||||||
|
const bMode = burnNvidiaMode();
|
||||||
|
if (bMode === 'ramp-up' && selected.length > 1) {
|
||||||
|
body.stagger_gpu_start = true;
|
||||||
|
} else if (bMode === 'parallel' && selected.length > 1) {
|
||||||
|
body.parallel_gpus = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fetch('/api/sat/' + target + '/run', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify(body)
|
||||||
|
}).then(function(r) {
|
||||||
|
return r.json().then(function(payload) {
|
||||||
|
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||||
|
return payload;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function streamTask(taskId, label) {
|
||||||
|
if (biES) { biES.close(); biES = null; }
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
|
||||||
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function streamBurnTask(taskId, label, resetTerminal) {
|
||||||
|
return streamBurnTaskSet([taskId], label, resetTerminal);
|
||||||
|
}
|
||||||
|
function streamBurnTaskSet(taskIds, label, resetTerminal) {
|
||||||
|
if (biES) { biES.close(); biES = null; }
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
if (resetTerminal) {
|
||||||
|
term.textContent = '';
|
||||||
|
}
|
||||||
|
if (!Array.isArray(taskIds) || !taskIds.length) {
|
||||||
|
term.textContent += 'ERROR: no tasks queued.\n';
|
||||||
|
return Promise.resolve({ok:false, error:'no tasks queued'});
|
||||||
|
}
|
||||||
|
const streamNext = function(idx, failures) {
|
||||||
|
if (idx >= taskIds.length) {
|
||||||
|
return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
|
||||||
|
}
|
||||||
|
const taskId = taskIds[idx];
|
||||||
|
term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
|
||||||
|
return new Promise(function(resolve) {
|
||||||
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve(failures + (e.data ? 1 : 0));
|
||||||
|
});
|
||||||
|
biES.onerror = function() {
|
||||||
|
if (biES) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve(failures + 1);
|
||||||
|
};
|
||||||
|
}).then(function(nextFailures) {
|
||||||
|
return streamNext(idx + 1, nextFailures);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return streamNext(0, 0);
|
||||||
|
}
|
||||||
|
function runBurnTaskSet(tasks, statusElId) {
|
||||||
|
const enabled = tasks.filter(function(t) {
|
||||||
|
const el = document.getElementById(t.id);
|
||||||
|
return el && el.checked && !el.disabled;
|
||||||
|
});
|
||||||
|
const status = statusElId ? document.getElementById(statusElId) : null;
|
||||||
|
if (status) status.textContent = '';
|
||||||
|
if (!enabled.length) {
|
||||||
|
if (status) status.textContent = 'No tasks selected.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
|
||||||
|
term.textContent = '';
|
||||||
|
const runNext = function(idx) {
|
||||||
|
if (idx >= enabled.length) {
|
||||||
|
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
|
||||||
|
return Promise.resolve();
|
||||||
|
}
|
||||||
|
const t = enabled[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
|
||||||
|
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||||
|
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||||
|
.then(function(d) {
|
||||||
|
return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
|
||||||
|
})
|
||||||
|
.then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
|
})
|
||||||
|
.catch(function(err) {
|
||||||
|
if (status) status.textContent = 'Error: ' + err.message;
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||||
|
return Promise.reject(err);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}
|
||||||
|
function runPlatformStress() {
|
||||||
|
const comps = [];
|
||||||
|
const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
|
||||||
|
const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
|
||||||
|
const hasChecked = function(ids) {
|
||||||
|
return ids.some(function(id) {
|
||||||
|
const el = document.getElementById(id);
|
||||||
|
return el && el.checked && !el.disabled;
|
||||||
|
});
|
||||||
|
};
|
||||||
|
if (hasChecked(computeIDs)) comps.push('cpu');
|
||||||
|
if (hasChecked(gpuIDs)) comps.push('gpu');
|
||||||
|
if (!comps.length) {
|
||||||
|
const status = document.getElementById('burn-all-status');
|
||||||
|
if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const extra = comps.length > 0 ? {platform_components: comps} : {};
|
||||||
|
enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', extra, false).then(function(d) {
|
||||||
|
streamTask(d.task_id, 'Platform Thermal Cycling');
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runAllBurnTasks() {
|
||||||
|
const status = document.getElementById('burn-all-status');
|
||||||
|
const all = [
|
||||||
|
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
||||||
|
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
||||||
|
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
|
||||||
|
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
|
||||||
|
{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
|
||||||
|
{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
|
||||||
|
{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
|
||||||
|
];
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
runBurnTaskSet(all, 'burn-all-status');
|
||||||
|
}
|
||||||
|
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
||||||
|
const map = {
|
||||||
|
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
||||||
|
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
||||||
|
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
|
||||||
|
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
|
||||||
|
};
|
||||||
|
tools.forEach(function(t) {
|
||||||
|
const spec = map[t.id];
|
||||||
|
if (!spec) return;
|
||||||
|
const cb = document.getElementById(spec.cb);
|
||||||
|
const note = document.getElementById(spec.note);
|
||||||
|
if (!cb) return;
|
||||||
|
if (t.available) {
|
||||||
|
cb.disabled = false;
|
||||||
|
} else if (note) {
|
||||||
|
note.textContent = '— ' + spec.reason;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}).catch(function() {});
|
||||||
|
burnLoadGPUs();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
434
audit/internal/webui/page_export_tools.go
Normal file
434
audit/internal/webui/page_export_tools.go
Normal file
@@ -0,0 +1,434 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"net/url"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// renderExport renders the Export page body: a Support Bundle card, a
// table of every file found under exportDir (each linked through
// /export/file?path=...), and the Export-to-USB card.
//
// File names are query-escaped for the href and HTML-escaped for the
// visible text, so arbitrary file names cannot break out of the markup.
func renderExport(exportDir string) string {
	// Best-effort listing: an error is deliberately ignored and simply
	// results in the "No export files found." placeholder row.
	entries, _ := listExportFiles(exportDir)
	var rows strings.Builder
	for _, e := range entries {
		rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
			url.QueryEscape(e), html.EscapeString(e)))
	}
	if len(entries) == 0 {
		rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
	}
	return `<div class="grid2">
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
` + renderSupportBundleInline() + `
</div></div>
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
<table><tr><th>File</th></tr>` + rows.String() + `</table>
</div></div>
</div>

` + renderUSBExportCard()
}
|
||||||
|
|
||||||
|
// listExportFiles walks exportDir and returns the paths of all regular
// files below it, relative to exportDir and sorted lexicographically.
//
// The directory path is whitespace-trimmed once and the trimmed value is
// used both for the walk and for computing relative paths. (The original
// trimmed only the walk root, so a padded exportDir made filepath.Rel
// fail for every file.) A missing directory is not an error: it simply
// yields an empty slice.
func listExportFiles(exportDir string) ([]string, error) {
	root := strings.TrimSpace(exportDir)
	var entries []string
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		entries = append(entries, rel)
		return nil
	})
	// A non-existent export directory is an expected state, not a failure.
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}
	sort.Strings(entries)
	return entries, nil
}
|
||||||
|
|
||||||
|
// renderSupportBundleInline returns the HTML/JS fragment for the support
// bundle download button. The script fetches /export/support.tar.gz as a
// blob, honors a server-supplied Content-Disposition filename, and
// triggers a client-side download via a temporary anchor element.
//
// Fix: the finally-handler previously restored the label as
// '\u2195 Download Support Bundle' (an up-down arrow), while the button
// is created with a down arrow; it now restores '\u2193' so the label is
// unchanged after a download completes.
func renderSupportBundleInline() string {
	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
<script>
window.supportBundleDownload = function() {
var btn = document.getElementById('support-bundle-btn');
var status = document.getElementById('support-bundle-status');
btn.disabled = true;
btn.textContent = 'Building...';
status.textContent = 'Collecting logs and export data\u2026';
status.style.color = 'var(--muted)';
var filename = 'bee-support.tar.gz';
fetch('/export/support.tar.gz')
.then(function(r) {
if (!r.ok) throw new Error('HTTP ' + r.status);
var cd = r.headers.get('Content-Disposition') || '';
var m = cd.match(/filename="?([^";]+)"?/);
if (m) filename = m[1];
return r.blob();
})
.then(function(blob) {
var url = URL.createObjectURL(blob);
var a = document.createElement('a');
a.href = url;
a.download = filename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
status.textContent = 'Download started.';
status.style.color = 'var(--ok-fg)';
})
.catch(function(e) {
status.textContent = 'Error: ' + e.message;
status.style.color = 'var(--crit-fg)';
})
.finally(function() {
btn.disabled = false;
btn.textContent = '\u2193 Download Support Bundle';
});
};
</script>`
}
|
||||||
|
|
||||||
|
func renderUSBExportCard() string {
|
||||||
|
return `<div class="card" style="margin-top:16px">
|
||||||
|
<div class="card-head">Export to USB
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||||
|
</div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderUSBExportInline returns the HTML/JS fragment for exporting audit
// data to removable USB media. The entire body is one runtime string, so
// documentation lives here rather than inline.
//
// The script:
//   - usbRefresh: GETs /api/export/usb and renders one table row per
//     detected device, caching the raw target list in window._usbTargets
//     so a later export can POST the exact object back.
//   - window.usbExport(type, targetIndex, btn): POSTs the selected target
//     to /api/export/usb/{type} (type is 'audit' or 'bundle'), mirroring
//     progress/success/error both in the page-level #usb-msg element and
//     in the per-row .usb-row-msg element next to the clicked button.
//
// usbRefresh is also published on window and invoked once on load.
func renderUSBExportInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
<div id="usb-targets" style="margin-top:12px"></div>
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
<script>
(function(){
function usbRefresh() {
document.getElementById('usb-status').textContent = 'Scanning...';
document.getElementById('usb-targets').innerHTML = '';
document.getElementById('usb-msg').textContent = '';
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
window._usbTargets = Array.isArray(targets) ? targets : [];
const st = document.getElementById('usb-status');
const ct = document.getElementById('usb-targets');
if (!targets || targets.length === 0) {
st.textContent = 'No removable USB devices found.';
return;
}
st.textContent = targets.length + ' device(s) found:';
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
targets.map((t, idx) => {
const dev = t.device || '';
const label = t.label || '';
const model = t.model || '';
return '<tr>' +
'<td style="font-family:monospace">'+dev+'</td>' +
'<td>'+t.fs_type+'</td>' +
'<td>'+t.size+'</td>' +
'<td>'+label+'</td>' +
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
'<td style="white-space:nowrap">' +
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
'</td></tr>';
}).join('') + '</table>';
}).catch(e => {
document.getElementById('usb-status').textContent = 'Error: ' + e;
});
}
window.usbExport = function(type, targetIndex, btn) {
const target = (window._usbTargets || [])[targetIndex];
if (!target) {
const msg = document.getElementById('usb-msg');
msg.style.color = 'var(--err,red)';
msg.textContent = 'Error: USB target not found. Refresh and try again.';
return;
}
const msg = document.getElementById('usb-msg');
const row = btn ? btn.closest('td') : null;
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
const originalText = btn ? btn.textContent : '';
if (btn) {
btn.disabled = true;
btn.textContent = 'Exporting...';
}
if (rowMsg) {
rowMsg.style.color = 'var(--muted)';
rowMsg.textContent = 'Working...';
}
msg.style.color = 'var(--muted)';
msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
fetch('/api/export/usb/'+type, {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(target)
}).then(async r => {
const d = await r.json();
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
return d;
}).then(d => {
msg.style.color = 'var(--ok,green)';
msg.textContent = d.message || 'Done.';
if (rowMsg) {
rowMsg.style.color = 'var(--ok,green)';
rowMsg.textContent = d.message || 'Done.';
}
}).catch(e => {
msg.style.color = 'var(--err,red)';
msg.textContent = 'Error: '+e;
if (rowMsg) {
rowMsg.style.color = 'var(--err,red)';
rowMsg.textContent = 'Error: ' + e;
}
}).finally(() => {
if (btn) {
btn.disabled = false;
btn.textContent = originalText;
}
});
};
window.usbRefresh = usbRefresh;
usbRefresh();
})();
</script>`
}
|
||||||
|
|
||||||
|
// renderNvidiaSelfHealInline returns the HTML/JS fragment for the NVIDIA
// self-heal panel. The body is a single runtime string, so documentation
// lives here rather than inline.
//
// The script defines:
//   - nvidiaSelfHealShowResult(label, status, output): reveals the shared
//     output terminal and colors the status marker green for 'ok', red
//     otherwise.
//   - nvidiaRestartDrivers(): POSTs {name:'bee-nvidia', action:'restart'}
//     to /api/services/action, then after 800 ms refreshes both the
//     services list (loadServices, defined elsewhere on the page) and the
//     GPU table.
//   - nvidiaResetGPU(index, btn): POSTs {index} to /api/gpu/nvidia-reset
//     and reloads the GPU table one second later.
//   - loadNvidiaSelfHeal(): GETs /api/gpu/nvidia-status and renders one
//     row per GPU; the badge class is keyed off g.status ('OK' →
//     badge-ok, 'RESET_REQUIRED' → badge-err, anything else →
//     badge-warn). The ID column prefers serial, then bdf, then a
//     synthetic 'gpu-<index>'.
//
// loadNvidiaSelfHeal runs once when the fragment is inserted.
func renderNvidiaSelfHealInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
</div>
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
</div>
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
function nvidiaSelfHealShowResult(label, status, output) {
var out = document.getElementById('nvidia-self-heal-out');
var term = document.getElementById('nvidia-self-heal-terminal');
var statusEl = document.getElementById('nvidia-self-heal-out-status');
var labelEl = document.getElementById('nvidia-self-heal-out-label');
out.style.display = 'block';
labelEl.textContent = label;
term.textContent = output || '(no output)';
term.scrollTop = term.scrollHeight;
if (status === 'ok') {
statusEl.textContent = '✓ done';
statusEl.style.color = 'var(--ok-fg, #2c662d)';
} else {
statusEl.textContent = '✗ failed';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
}
}
function nvidiaRestartDrivers() {
var btn = document.getElementById('nvidia-restart-btn');
var original = btn.textContent;
btn.disabled = true;
btn.textContent = 'Restarting...';
nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
fetch('/api/services/action', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({name:'bee-nvidia', action:'restart'})
}).then(r=>r.json()).then(d => {
nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
setTimeout(function() {
loadServices();
loadNvidiaSelfHeal();
}, 800);
}).catch(e => {
nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
}).finally(() => {
btn.disabled = false;
btn.textContent = original;
});
}
function nvidiaResetGPU(index, btn) {
var original = btn.textContent;
btn.disabled = true;
btn.textContent = 'Resetting...';
nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
fetch('/api/gpu/nvidia-reset', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({index:index})
}).then(r=>r.json()).then(d => {
nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || '(no output)');
setTimeout(loadNvidiaSelfHeal, 1000);
}).catch(e => {
nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
}).finally(() => {
btn.disabled = false;
btn.textContent = original;
});
}
function loadNvidiaSelfHeal() {
var status = document.getElementById('nvidia-self-heal-status');
var table = document.getElementById('nvidia-self-heal-table');
status.textContent = 'Loading NVIDIA GPU status...';
status.style.color = 'var(--muted)';
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
if (!Array.isArray(gpus) || gpus.length === 0) {
status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
table.innerHTML = '';
return;
}
status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
const rows = gpus.map(g => {
const serial = g.serial || '';
const bdf = g.bdf || '';
const id = serial || bdf || ('gpu-' + g.index);
const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
const details = [];
if (serial) details.push('serial ' + serial);
if (bdf) details.push('bdf ' + bdf);
if (g.parse_failure && g.raw_line) details.push(g.raw_line);
return '<tr>'
+ '<td style="white-space:nowrap">' + g.index + '</td>'
+ '<td>' + (g.name || 'unknown') + '</td>'
+ '<td style="font-family:monospace">' + id + '</td>'
+ '<td><span class="badge ' + badge + '">' + (g.status || 'UNKNOWN') + '</span>'
+ (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.join(' | ') + '</div>' : '')
+ '</td>'
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + g.index + ', this)">Reset GPU</button></td>'
+ '</tr>';
}).join('');
table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
}).catch(e => {
status.textContent = 'Error loading NVIDIA GPU status: ' + e;
status.style.color = 'var(--crit-fg, #9f3a38)';
table.innerHTML = '';
});
}
loadNvidiaSelfHeal();
</script>`
}
|
||||||
|
|
||||||
|
// renderTools renders the Tools page: a System Install card (install to
// RAM via /api/system/ram-status + /api/system/install-to-ram, plus the
// shared install-to-disk fragment), a Support Bundle card with the USB
// export fragment, a Tool Check card fed by /api/tools/check, and the
// NVIDIA self-heal, Network and Services cards from their sibling
// renderers. The body is a single concatenated runtime string, so
// documentation lives here rather than inline.
func renderTools() string {
	return `<div class="card" style="margin-bottom:16px">
<div class="card-head">System Install</div>
<div class="card-body">
<div style="margin-bottom:20px">
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
</div>
<div style="border-top:1px solid var(--line);padding-top:20px">
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
		renderInstallInline() + `
</div>
</div>
</div>
<script>
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
const boot = document.getElementById('boot-source-text');
const txt = document.getElementById('ram-status-text');
const btn = document.getElementById('ram-install-btn');
let source = d.device || d.source || 'unknown source';
let kind = d.kind || 'unknown';
let label = source;
if (kind === 'ram') label = 'RAM';
else if (kind === 'usb') label = 'USB (' + source + ')';
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
else if (kind === 'disk') label = 'disk (' + source + ')';
else label = source;
boot.textContent = 'Current boot source: ' + label + '.';
txt.textContent = d.message || 'Checking...';
if (d.status === 'ok' || d.in_ram) {
txt.style.color = 'var(--ok, green)';
} else if (d.status === 'failed') {
txt.style.color = 'var(--err, #b91c1c)';
} else {
txt.style.color = 'var(--muted)';
}
if (d.can_start_task) {
btn.style.display = '';
btn.disabled = false;
} else {
btn.style.display = 'none';
}
});
function installToRAM() {
document.getElementById('ram-install-btn').disabled = true;
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
window.location.href = '/tasks#' + d.task_id;
});
}
</script>

<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
` + renderSupportBundleInline() + `
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
` + renderUSBExportInline() + `
</div>
</div></div>

<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>

<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
		renderNvidiaSelfHealInline() + `</div></div>

<div class="card"><div class="card-head">Network</div><div class="card-body">` +
		renderNetworkInline() + `</div></div>

<div class="card"><div class="card-head">Services</div><div class="card-body">` +
		renderServicesInline() + `</div></div>


<script>
function checkTools() {
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
const rows = tools.map(t =>
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
).join('');
document.getElementById('tools-table').innerHTML =
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
});
}
checkTools();
</script>`
}
|
||||||
|
|
||||||
|
func renderExportIndex(exportDir string) (string, error) {
|
||||||
|
entries, err := listExportFiles(exportDir)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||||
|
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||||
|
for _, entry := range entries {
|
||||||
|
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
body.WriteString(`<li>No export files found.</li>`)
|
||||||
|
}
|
||||||
|
body.WriteString(`</ul></body></html>`)
|
||||||
|
return body.String(), nil
|
||||||
|
}
|
||||||
314
audit/internal/webui/page_install_tasks.go
Normal file
314
audit/internal/webui/page_install_tasks.go
Normal file
@@ -0,0 +1,314 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
// renderInstallInline returns the HTML/JS fragment for installing the
// live system to a disk. The body is a single runtime string, so
// documentation lives here rather than inline.
//
// Flow implemented by the script:
//   - installRefreshDisks: GETs /api/install/disks and renders one
//     selectable row per disk, stashing device/model/size/warnings in
//     data- attributes; warnings containing 'too small' get badge-err,
//     others badge-warn.
//   - installSelectDisk / installDeselect / installCheckConfirm: the user
//     must retype the exact device path to enable the Install button.
//   - installStart: POSTs {device} to /api/install/run, then streams the
//     task log via installStreamLog (EventSource on
//     /api/tasks/{id}/stream); the 'done' event's data is empty on
//     success and carries the error text on failure.
//
// Fix: the warning-badge title attribute previously used
// w.replace(/"/g,'"') — a no-op that left raw double quotes inside
// title="...", truncating the attribute. Quotes are now escaped to
// &quot;.
func renderInstallInline() string {
	return `
<div class="alert alert-warn" style="margin-bottom:16px">
<strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.
</div>
<div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
<div id="install-disk-section" style="display:none">
<div class="card" style="margin-bottom:0">
<table id="install-disk-table">
<thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
<tbody id="install-disk-tbody"></tbody>
</table>
</div>
<div style="margin-top:12px">
<button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
</div>
</div>
<div id="install-confirm-section" style="display:none;margin-top:20px">
<div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
<div class="form-row" style="max-width:360px">
<label>Type the device name to confirm (e.g. /dev/sda)</label>
<input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
</div>
<button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
<button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
</div>
<div id="install-progress-section" style="display:none;margin-top:20px">
<div class="card-head" style="margin-bottom:8px">Installation Progress</div>
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
</div>

<style>
#install-disk-tbody tr{cursor:pointer}
#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
</style>

<script>
var _installSelected = null;

function installRefreshDisks() {
document.getElementById('install-loading').style.display = '';
document.getElementById('install-disk-section').style.display = 'none';
document.getElementById('install-confirm-section').style.display = 'none';
_installSelected = null;
fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
document.getElementById('install-loading').style.display = 'none';
var tbody = document.getElementById('install-disk-tbody');
tbody.innerHTML = '';
if (!disks || disks.length === 0) {
tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
} else {
disks.forEach(function(d) {
var warnings = (d.warnings || []);
var statusHtml;
if (warnings.length === 0) {
statusHtml = '<span class="badge badge-ok">OK</span>';
} else {
var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
statusHtml = warnings.map(function(w){
var cls = hasSmall ? 'badge-err' : 'badge-warn';
return '<span class="badge ' + cls + '" title="' + w.replace(/"/g,'&quot;') + '">' +
(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
}).join(' ');
}
var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
var tr = document.createElement('tr');
tr.dataset.device = d.device;
tr.dataset.model = d.model || 'Unknown';
tr.dataset.size = d.size;
tr.dataset.warnings = JSON.stringify(warnings);
tr.innerHTML =
'<td><input type="radio" name="install-disk" value="' + d.device + '"></td>' +
'<td><code>' + d.device + '</code>' + mountedNote + '</td>' +
'<td>' + (d.model || '—') + '</td>' +
'<td>' + d.size + '</td>' +
'<td>' + statusHtml + '</td>';
tr.addEventListener('click', function(){ installSelectDisk(this); });
tbody.appendChild(tr);
});
}
document.getElementById('install-disk-section').style.display = '';
}).catch(function(e){
document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
});
}

function installSelectDisk(tr) {
document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
tr.classList.add('selected');
var radio = tr.querySelector('input[type=radio]');
if (radio) radio.checked = true;
_installSelected = {
device: tr.dataset.device,
model: tr.dataset.model,
size: tr.dataset.size,
warnings: JSON.parse(tr.dataset.warnings || '[]')
};
var warnBox = document.getElementById('install-confirm-warn');
var warnLines = '<strong>⚠ DANGER:</strong> ' + _installSelected.device +
' (' + _installSelected.model + ', ' + _installSelected.size + ')' +
' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
if (_installSelected.warnings.length > 0) {
warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + w; }).join('<br>');
}
warnBox.innerHTML = warnLines;
document.getElementById('install-confirm-input').value = '';
document.getElementById('install-start-btn').disabled = true;
document.getElementById('install-confirm-section').style.display = '';
document.getElementById('install-progress-section').style.display = 'none';
}

function installDeselect() {
_installSelected = null;
document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
document.getElementById('install-confirm-section').style.display = 'none';
}

function installCheckConfirm() {
var val = document.getElementById('install-confirm-input').value.trim();
var ok = _installSelected && val === _installSelected.device;
document.getElementById('install-start-btn').disabled = !ok;
}

function installStart() {
if (!_installSelected) return;
document.getElementById('install-confirm-section').style.display = 'none';
document.getElementById('install-disk-section').style.display = 'none';
document.getElementById('install-loading').style.display = 'none';
var prog = document.getElementById('install-progress-section');
var term = document.getElementById('install-terminal');
var status = document.getElementById('install-status');
prog.style.display = '';
term.textContent = '';
status.textContent = 'Starting installation…';
status.style.color = 'var(--muted)';

fetch('/api/install/run', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({device: _installSelected.device})
}).then(function(r){
return r.json().then(function(j){
if (!r.ok) throw new Error(j.error || r.statusText);
return j;
});
}).then(function(j){
if (!j.task_id) throw new Error('missing task id');
installStreamLog(j.task_id);
}).catch(function(e){
status.textContent = 'Error: ' + e;
status.style.color = 'var(--crit-fg)';
});
}

function installStreamLog(taskId) {
var term = document.getElementById('install-terminal');
var status = document.getElementById('install-status');
var es = new EventSource('/api/tasks/' + taskId + '/stream');
es.onmessage = function(e) {
term.textContent += e.data + '\n';
term.scrollTop = term.scrollHeight;
};
es.addEventListener('done', function(e) {
es.close();
if (!e.data) {
status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
var rebootBtn = document.createElement('button');
rebootBtn.className = 'btn btn-primary btn-sm';
rebootBtn.style.marginLeft = '12px';
rebootBtn.textContent = 'Reboot now';
rebootBtn.onclick = function(){
fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
body: JSON.stringify({name:'', action:'reboot'})});
};
status.appendChild(rebootBtn);
} else {
status.textContent = '✗ Installation failed: ' + e.data;
status.style.color = 'var(--crit-fg)';
}
});
es.onerror = function() {
es.close();
status.textContent = '✗ Stream disconnected.';
status.style.color = 'var(--crit-fg)';
};
}

installRefreshDisks();
</script>
`
}
|
||||||
|
|
||||||
|
func renderInstall() string {
|
||||||
|
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||||
|
renderInstallInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTasks() string {
|
||||||
|
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
||||||
|
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
||||||
|
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
|
||||||
|
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
||||||
|
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
var _taskRefreshTimer = null;
|
||||||
|
var _tasksAll = [];
|
||||||
|
var _taskPage = 1;
|
||||||
|
var _taskPageSize = 50;
|
||||||
|
|
||||||
|
function loadTasks() {
|
||||||
|
fetch('/api/tasks').then(r=>r.json()).then(tasks => {
|
||||||
|
_tasksAll = Array.isArray(tasks) ? tasks : [];
|
||||||
|
if (_tasksAll.length === 0) {
|
||||||
|
_taskPage = 1;
|
||||||
|
document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||||
|
if (_taskPage > totalPages) _taskPage = totalPages;
|
||||||
|
if (_taskPage < 1) _taskPage = 1;
|
||||||
|
const start = (_taskPage - 1) * _taskPageSize;
|
||||||
|
const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
|
||||||
|
const rows = pageTasks.map(t => {
|
||||||
|
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
|
||||||
|
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
||||||
|
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
||||||
|
let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
|
||||||
|
if (t.status === 'running' || t.status === 'pending') {
|
||||||
|
actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask(\''+t.id+'\')">Cancel</button>';
|
||||||
|
}
|
||||||
|
if (t.status === 'pending') {
|
||||||
|
actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',1)" title="Increase priority">⇧</button>';
|
||||||
|
actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',-1)" title="Decrease priority">⇩</button>';
|
||||||
|
}
|
||||||
|
return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
|
||||||
|
'<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
|
||||||
|
'<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
|
||||||
|
'<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
|
||||||
|
'<td>'+t.priority+'</td>' +
|
||||||
|
'<td>'+actions+'</td></tr>';
|
||||||
|
}).join('');
|
||||||
|
const showingFrom = start + 1;
|
||||||
|
const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
|
||||||
|
const pager =
|
||||||
|
'<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
|
||||||
|
'<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
|
||||||
|
'<div style="display:flex;align-items:center;gap:8px">' +
|
||||||
|
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
|
||||||
|
'<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
|
||||||
|
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
|
||||||
|
'</div>' +
|
||||||
|
'</div>';
|
||||||
|
document.getElementById('tasks-table').innerHTML =
|
||||||
|
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function escHtml(s) { return (s||'').replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"'); }
|
||||||
|
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
|
||||||
|
function formatDurSec(sec) {
|
||||||
|
sec = Math.max(0, Math.round(sec||0));
|
||||||
|
if (sec < 60) return sec+'s';
|
||||||
|
const m = Math.floor(sec/60), ss = sec%60;
|
||||||
|
return m+'m '+ss+'s';
|
||||||
|
}
|
||||||
|
function setTaskPage(page) {
|
||||||
|
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||||
|
_taskPage = Math.min(totalPages, Math.max(1, page));
|
||||||
|
loadTasks();
|
||||||
|
}
|
||||||
|
|
||||||
|
function cancelTask(id) {
|
||||||
|
fetch('/api/tasks/'+id+'/cancel',{method:'POST'}).then(()=>loadTasks());
|
||||||
|
}
|
||||||
|
function cancelAll() {
|
||||||
|
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
||||||
|
}
|
||||||
|
function killWorkers() {
|
||||||
|
if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
|
||||||
|
fetch('/api/tasks/kill-workers',{method:'POST'})
|
||||||
|
.then(r=>r.json())
|
||||||
|
.then(d=>{
|
||||||
|
loadTasks();
|
||||||
|
var toast = document.getElementById('kill-toast');
|
||||||
|
var parts = [];
|
||||||
|
if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
|
||||||
|
if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
|
||||||
|
toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
|
||||||
|
toast.style.display = '';
|
||||||
|
setTimeout(()=>{ toast.style.display='none'; }, 5000);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function setPriority(id, delta) {
|
||||||
|
fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
|
||||||
|
.then(()=>loadTasks());
|
||||||
|
}
|
||||||
|
|
||||||
|
loadTasks();
|
||||||
|
_taskRefreshTimer = setInterval(loadTasks, 2000);
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
238
audit/internal/webui/page_metrics.go
Normal file
238
audit/internal/webui/page_metrics.go
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
func renderMetrics() string {
|
||||||
|
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Server — Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Temperature — CPU</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Temperature — Ambient Sensors</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Server — Power</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
|
||||||
|
<div class="card-head">Server — Fan RPM</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
|
||||||
|
<div>
|
||||||
|
<div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
|
||||||
|
<div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
|
||||||
|
</div>
|
||||||
|
<label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
|
||||||
|
<input id="gpu-chart-toggle" type="checkbox">
|
||||||
|
<span>One chart per GPU</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="gpu-metrics-by-metric">
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Compute Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Memory Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Core Clock</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Power</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Temperature</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="gpu-metrics-by-gpu" style="display:none"></div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let gpuChartKey = '';
|
||||||
|
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
|
||||||
|
let metricsNvidiaGPUsPromise = null;
|
||||||
|
|
||||||
|
function loadMetricsNvidiaGPUs() {
|
||||||
|
if (!metricsNvidiaGPUsPromise) {
|
||||||
|
metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||||
|
.then(function(r) {
|
||||||
|
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||||
|
return r.json();
|
||||||
|
})
|
||||||
|
.then(function(list) { return Array.isArray(list) ? list : []; })
|
||||||
|
.catch(function() { return []; });
|
||||||
|
}
|
||||||
|
return metricsNvidiaGPUsPromise;
|
||||||
|
}
|
||||||
|
|
||||||
|
function metricsGPUNameMap(list) {
|
||||||
|
const out = {};
|
||||||
|
(list || []).forEach(function(gpu) {
|
||||||
|
const idx = Number(gpu.index);
|
||||||
|
if (!Number.isFinite(idx) || !gpu.name) return;
|
||||||
|
out[idx] = gpu.name;
|
||||||
|
});
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
function metricsGPUDisplayLabel(idx, names) {
|
||||||
|
const name = names && names[idx];
|
||||||
|
return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
function loadGPUChartModePreference() {
|
||||||
|
try {
|
||||||
|
return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
|
||||||
|
} catch (_) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function saveGPUChartModePreference(perGPU) {
|
||||||
|
try {
|
||||||
|
sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
|
||||||
|
} catch (_) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
function refreshChartImage(el) {
|
||||||
|
if (!el || el.dataset.loading === '1') return;
|
||||||
|
if (el.offsetParent === null) return;
|
||||||
|
const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
|
||||||
|
const nextSrc = baseSrc + '?t=' + Date.now();
|
||||||
|
const probe = new Image();
|
||||||
|
el.dataset.baseSrc = baseSrc;
|
||||||
|
el.dataset.loading = '1';
|
||||||
|
probe.onload = function() {
|
||||||
|
el.src = nextSrc;
|
||||||
|
el.dataset.loading = '0';
|
||||||
|
};
|
||||||
|
probe.onerror = function() {
|
||||||
|
el.dataset.loading = '0';
|
||||||
|
};
|
||||||
|
probe.src = nextSrc;
|
||||||
|
}
|
||||||
|
|
||||||
|
function refreshCharts() {
|
||||||
|
document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
|
||||||
|
}
|
||||||
|
|
||||||
|
function gpuIndices(rows) {
|
||||||
|
const seen = {};
|
||||||
|
const out = [];
|
||||||
|
(rows || []).forEach(function(row) {
|
||||||
|
const idx = Number(row.index);
|
||||||
|
if (!Number.isFinite(idx) || seen[idx]) return;
|
||||||
|
seen[idx] = true;
|
||||||
|
out.push(idx);
|
||||||
|
});
|
||||||
|
return out.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
|
||||||
|
function renderGPUOverviewCards(indices, names) {
|
||||||
|
const host = document.getElementById('gpu-metrics-by-gpu');
|
||||||
|
if (!host) return;
|
||||||
|
host.innerHTML = indices.map(function(idx) {
|
||||||
|
const label = metricsGPUDisplayLabel(idx, names);
|
||||||
|
return '<div class="card" style="margin-bottom:16px">' +
|
||||||
|
'<div class="card-head">' + label + ' — Overview</div>' +
|
||||||
|
'<div class="card-body" style="padding:8px">' +
|
||||||
|
'<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
|
||||||
|
'</div></div>';
|
||||||
|
}).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
function applyGPUChartMode() {
|
||||||
|
const perMetric = document.getElementById('gpu-metrics-by-metric');
|
||||||
|
const perGPU = document.getElementById('gpu-metrics-by-gpu');
|
||||||
|
const toggle = document.getElementById('gpu-chart-toggle');
|
||||||
|
const gpuModePerGPU = !!(toggle && toggle.checked);
|
||||||
|
if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
|
||||||
|
if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
|
||||||
|
}
|
||||||
|
|
||||||
|
function syncMetricsLayout(d) {
|
||||||
|
const fanCard = document.getElementById('card-server-fans');
|
||||||
|
if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
|
||||||
|
const section = document.getElementById('gpu-metrics-section');
|
||||||
|
const summary = document.getElementById('gpu-metrics-summary');
|
||||||
|
const indices = gpuIndices(d.gpus);
|
||||||
|
loadMetricsNvidiaGPUs().then(function(gpus) {
|
||||||
|
const names = metricsGPUNameMap(gpus);
|
||||||
|
if (section) section.style.display = indices.length > 0 ? '' : 'none';
|
||||||
|
if (summary) {
|
||||||
|
summary.textContent = indices.length > 0
|
||||||
|
? ('Detected GPUs: ' + indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); }).join(', '))
|
||||||
|
: 'No GPUs detected in live metrics.';
|
||||||
|
}
|
||||||
|
const nextKey = indices.join(',') + '|' + indices.map(function(idx) { return names[idx] || ''; }).join(',');
|
||||||
|
if (nextKey !== gpuChartKey) {
|
||||||
|
renderGPUOverviewCards(indices, names);
|
||||||
|
gpuChartKey = nextKey;
|
||||||
|
}
|
||||||
|
applyGPUChartMode();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function loadMetricsLayout() {
|
||||||
|
fetch('/api/metrics/latest').then(function(r) { return r.json(); }).then(syncMetricsLayout).catch(function() {});
|
||||||
|
}
|
||||||
|
|
||||||
|
const gpuChartToggle = document.getElementById('gpu-chart-toggle');
|
||||||
|
if (gpuChartToggle) {
|
||||||
|
gpuChartToggle.checked = loadGPUChartModePreference();
|
||||||
|
}
|
||||||
|
applyGPUChartMode();
|
||||||
|
|
||||||
|
if (gpuChartToggle) {
|
||||||
|
gpuChartToggle.addEventListener('change', function() {
|
||||||
|
saveGPUChartModePreference(!!gpuChartToggle.checked);
|
||||||
|
applyGPUChartMode();
|
||||||
|
refreshCharts();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
loadMetricsLayout();
|
||||||
|
setInterval(refreshCharts, 3000);
|
||||||
|
setInterval(loadMetricsLayout, 5000);
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
213
audit/internal/webui/page_network_services.go
Normal file
213
audit/internal/webui/page_network_services.go
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import "html"
|
||||||
|
|
||||||
|
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
|
||||||
|
func renderNetworkInline() string {
|
||||||
|
return `<div id="net-pending" style="display:none" class="alert alert-warn">
|
||||||
|
<strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
|
||||||
|
<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
|
||||||
|
<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
|
||||||
|
</div>
|
||||||
|
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div class="grid2" style="margin-top:16px">
|
||||||
|
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
|
||||||
|
<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
|
||||||
|
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
|
||||||
|
<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||||
|
</div>
|
||||||
|
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
|
||||||
|
<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
|
||||||
|
<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
|
||||||
|
<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
|
||||||
|
<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
|
||||||
|
<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
|
||||||
|
<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
|
||||||
|
<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
var _netCountdownTimer = null;
|
||||||
|
var _netRefreshTimer = null;
|
||||||
|
const NET_ROLLBACK_SECS = 60;
|
||||||
|
function loadNetwork() {
|
||||||
|
fetch('/api/network').then(r=>r.json()).then(d => {
|
||||||
|
const rows = (d.interfaces||[]).map(i =>
|
||||||
|
'<tr><td style="cursor:pointer" onclick="selectIface(\''+i.Name+'\')" title="Use this interface in the forms below"><span style="text-decoration:underline">'+i.Name+'</span></td>' +
|
||||||
|
'<td style="cursor:pointer" onclick="toggleIface(\''+i.Name+'\',\''+i.State+'\')" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td>' +
|
||||||
|
'<td>'+(i.IPv4||[]).join(', ')+'</td></tr>'
|
||||||
|
).join('');
|
||||||
|
document.getElementById('iface-table').innerHTML =
|
||||||
|
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
|
||||||
|
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
||||||
|
if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
|
else hideNetPending();
|
||||||
|
}).catch(function() {});
|
||||||
|
}
|
||||||
|
function selectIface(iface) {
|
||||||
|
document.getElementById('dhcp-iface').value = iface;
|
||||||
|
document.getElementById('st-iface').value = iface;
|
||||||
|
}
|
||||||
|
function toggleIface(iface, currentState) {
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
|
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
||||||
|
.then(r=>r.json()).then(d => {
|
||||||
|
if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
|
||||||
|
loadNetwork();
|
||||||
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function hideNetPending() {
|
||||||
|
const el = document.getElementById('net-pending');
|
||||||
|
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||||
|
_netCountdownTimer = null;
|
||||||
|
el.style.display = 'none';
|
||||||
|
}
|
||||||
|
function showNetPending(secs) {
|
||||||
|
if (!secs || secs < 1) { hideNetPending(); return; }
|
||||||
|
const el = document.getElementById('net-pending');
|
||||||
|
el.style.display = 'block';
|
||||||
|
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||||
|
let remaining = secs;
|
||||||
|
document.getElementById('net-countdown').textContent = remaining;
|
||||||
|
_netCountdownTimer = setInterval(function() {
|
||||||
|
remaining--;
|
||||||
|
document.getElementById('net-countdown').textContent = remaining;
|
||||||
|
if (remaining <= 0) { hideNetPending(); loadNetwork(); }
|
||||||
|
}, 1000);
|
||||||
|
}
|
||||||
|
function confirmNetChange() {
|
||||||
|
hideNetPending();
|
||||||
|
fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||||
|
}
|
||||||
|
function rollbackNetChange() {
|
||||||
|
hideNetPending();
|
||||||
|
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||||
|
}
|
||||||
|
function runDHCP() {
|
||||||
|
const iface = document.getElementById('dhcp-iface').value.trim();
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
|
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
||||||
|
.then(r=>r.json()).then(d => {
|
||||||
|
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
||||||
|
if (d.error) { hideNetPending(); return; }
|
||||||
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
|
loadNetwork();
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function setStatic() {
|
||||||
|
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
|
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
|
||||||
|
interface: document.getElementById('st-iface').value,
|
||||||
|
address: document.getElementById('st-addr').value,
|
||||||
|
prefix: document.getElementById('st-prefix').value,
|
||||||
|
gateway: document.getElementById('st-gw').value,
|
||||||
|
dns: dns,
|
||||||
|
})}).then(r=>r.json()).then(d => {
|
||||||
|
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
||||||
|
if (d.error) { hideNetPending(); return; }
|
||||||
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
|
loadNetwork();
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
loadNetwork();
|
||||||
|
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
|
||||||
|
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderNetwork() string {
|
||||||
|
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||||
|
renderNetworkInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderServicesInline() string {
|
||||||
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||||
|
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||||
|
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div id="svc-out" style="display:none;margin-top:12px">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
|
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||||
|
<span id="svc-out-status" style="font-size:12px"></span>
|
||||||
|
</div>
|
||||||
|
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
function loadServices() {
|
||||||
|
fetch('/api/services').then(r=>r.json()).then(svcs => {
|
||||||
|
const rows = svcs.map(s => {
|
||||||
|
const st = s.state||'unknown';
|
||||||
|
const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
|
||||||
|
const id = 'svc-body-'+s.name.replace(/[^a-z0-9]/g,'-');
|
||||||
|
const body = (s.body||'').replace(/</g,'<').replace(/>/g,'>');
|
||||||
|
return '<tr>' +
|
||||||
|
'<td style="white-space:nowrap">'+s.name+'</td>' +
|
||||||
|
'<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+st+' ▾</span>' +
|
||||||
|
'<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
|
||||||
|
'</td>' +
|
||||||
|
'<td style="white-space:nowrap">' +
|
||||||
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start" onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
|
||||||
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop" onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
|
||||||
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
|
||||||
|
'</td></tr>';
|
||||||
|
}).join('');
|
||||||
|
document.getElementById('svc-table').innerHTML =
|
||||||
|
'<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function toggleBody(id) {
|
||||||
|
const el = document.getElementById(id);
|
||||||
|
if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
|
||||||
|
}
|
||||||
|
function svcAction(btn, name, action) {
|
||||||
|
var label = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = '...';
|
||||||
|
var out = document.getElementById('svc-out');
|
||||||
|
var term = document.getElementById('svc-terminal');
|
||||||
|
var statusEl = document.getElementById('svc-out-status');
|
||||||
|
var labelEl = document.getElementById('svc-out-label');
|
||||||
|
out.style.display = 'block';
|
||||||
|
labelEl.textContent = action + ' ' + name;
|
||||||
|
term.textContent = 'Running...';
|
||||||
|
statusEl.textContent = '';
|
||||||
|
statusEl.style.color = '';
|
||||||
|
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
|
||||||
|
.then(r=>r.json()).then(d => {
|
||||||
|
term.textContent = d.output || d.error || '(no output)';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
if (d.status === 'ok') {
|
||||||
|
statusEl.textContent = '✓ done';
|
||||||
|
statusEl.style.color = 'var(--ok-fg, #2c662d)';
|
||||||
|
} else {
|
||||||
|
statusEl.textContent = '✗ failed';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
}
|
||||||
|
btn.textContent = label;
|
||||||
|
btn.disabled = false;
|
||||||
|
setTimeout(loadServices, 800);
|
||||||
|
}).catch(e => {
|
||||||
|
term.textContent = 'Request failed: ' + e;
|
||||||
|
statusEl.textContent = '✗ error';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
btn.textContent = label;
|
||||||
|
btn.disabled = false;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
loadServices();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderServices() string {
|
||||||
|
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||||
|
renderServicesInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
663
audit/internal/webui/page_validate.go
Normal file
663
audit/internal/webui/page_validate.go
Normal file
@@ -0,0 +1,663 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// validateInventory holds the per-category hardware summary lines shown on
// the Validate page, built from the most recent audit snapshot by
// loadValidateInventory.
type validateInventory struct {
	CPU     string // human-readable summary of installed CPUs
	Memory  string // human-readable summary of installed memory modules
	Storage string // human-readable summary of storage devices
	NVIDIA  string // human-readable summary of NVIDIA GPUs
	AMD     string // human-readable summary of AMD GPUs
	// NvidiaGPUCount feeds the page's run-time estimates (GPU-dependent
	// validate/stress totals).
	NvidiaGPUCount int
	AMDGPUCount    int // number of AMD GPUs detected in the snapshot
}
|
||||||
|
|
||||||
|
// validateFmtDur formats an approximate duration for display: values under
// two minutes are shown in seconds (e.g. "~90 s"), everything else in
// minutes rounded to the nearest whole minute (e.g. "~3 min").
func validateFmtDur(secs int) string {
	if secs >= 120 {
		// +29 rounds to the nearest minute before integer division.
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||||
|
|
||||||
|
func validateTotalValidateSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateTotalStressSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
|
platform.SATEstimatedMemoryStressSec +
|
||||||
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValidate(opts HandlerOptions) string {
|
||||||
|
inv := loadValidateInventory(opts)
|
||||||
|
n := inv.NvidiaGPUCount
|
||||||
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
|
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||||
|
gpuNote := ""
|
||||||
|
if n > 0 {
|
||||||
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
|
}
|
||||||
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||||
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Validate Profile</div>
|
||||||
|
<div class="card-body validate-profile-body">
|
||||||
|
<div class="validate-profile-col">
|
||||||
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||||
|
</div>
|
||||||
|
<div class="validate-profile-col validate-profile-action">
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
|
<div style="margin-top:12px">
|
||||||
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||||
|
inv.CPU,
|
||||||
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
|
inv.Memory,
|
||||||
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
|
inv.Storage,
|
||||||
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
|
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
|
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||||
|
)) +
|
||||||
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
|
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
|
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
|
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-interconnect">` +
|
||||||
|
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-bandwidth">` +
|
||||||
|
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
|
`<code>nvbandwidth</code>`,
|
||||||
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`</div>
|
||||||
|
<div class="grid3" style="margin-top:16px">
|
||||||
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
|
inv.AMD,
|
||||||
|
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||||
|
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||||
|
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||||
|
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
<style>
|
||||||
|
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||||
|
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||||
|
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||||
|
.validate-card-body { padding:0; }
|
||||||
|
.validate-card-section { padding:12px 16px 0; }
|
||||||
|
.validate-card-section:last-child { padding-bottom:16px; }
|
||||||
|
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||||
|
</style>
|
||||||
|
<script>
|
||||||
|
let satES = null;
|
||||||
|
function satStressMode() {
|
||||||
|
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||||
|
}
|
||||||
|
function satModeChanged() {
|
||||||
|
const stress = satStressMode();
|
||||||
|
[
|
||||||
|
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||||
|
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||||
|
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||||
|
].forEach(function(item) {
|
||||||
|
const card = document.getElementById(item.card);
|
||||||
|
if (card) {
|
||||||
|
card.style.opacity = stress ? '1' : '0.5';
|
||||||
|
const hint = document.getElementById(item.hint);
|
||||||
|
if (hint) hint.style.display = stress ? 'none' : '';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function satLabels() {
|
||||||
|
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
|
}
|
||||||
|
let satNvidiaGPUsPromise = null;
|
||||||
|
function loadSatNvidiaGPUs() {
|
||||||
|
if (!satNvidiaGPUsPromise) {
|
||||||
|
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||||
|
.then(r => {
|
||||||
|
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||||
|
return r.json();
|
||||||
|
})
|
||||||
|
.then(list => Array.isArray(list) ? list : []);
|
||||||
|
}
|
||||||
|
return satNvidiaGPUsPromise;
|
||||||
|
}
|
||||||
|
function satSelectedGPUIndices() {
|
||||||
|
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||||
|
.filter(function(el) { return el.checked && !el.disabled; })
|
||||||
|
.map(function(el) { return parseInt(el.value, 10); })
|
||||||
|
.filter(function(v) { return !Number.isNaN(v); })
|
||||||
|
.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
function satUpdateGPUSelectionNote() {
|
||||||
|
const note = document.getElementById('sat-gpu-selection-note');
|
||||||
|
if (!note) return;
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
|
||||||
|
}
|
||||||
|
function satRenderGPUList(gpus) {
|
||||||
|
const root = document.getElementById('sat-gpu-list');
|
||||||
|
if (!root) return;
|
||||||
|
if (!gpus || !gpus.length) {
|
||||||
|
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
root.innerHTML = gpus.map(function(gpu) {
|
||||||
|
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||||
|
return '<label class="sat-gpu-row">'
|
||||||
|
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
|
||||||
|
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||||
|
+ '</label>';
|
||||||
|
}).join('');
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
}
|
||||||
|
function satSelectAllGPUs() {
|
||||||
|
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
}
|
||||||
|
function satSelectNoGPUs() {
|
||||||
|
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
}
|
||||||
|
function satLoadGPUs() {
|
||||||
|
loadSatNvidiaGPUs().then(function(gpus) {
|
||||||
|
satRenderGPUList(gpus);
|
||||||
|
}).catch(function(err) {
|
||||||
|
const root = document.getElementById('sat-gpu-list');
|
||||||
|
if (root) {
|
||||||
|
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
}
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function satGPUDisplayName(gpu) {
|
||||||
|
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||||||
|
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||||||
|
return 'GPU ' + idx + ' — ' + name;
|
||||||
|
}
|
||||||
|
function satRequestBody(target, overrides) {
|
||||||
|
const body = {};
|
||||||
|
const labels = satLabels();
|
||||||
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
|
body.stress_mode = satStressMode();
|
||||||
|
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||||
|
if (overrides) {
|
||||||
|
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||||
|
}
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
function enqueueSATTarget(target, overrides) {
|
||||||
|
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||||
|
.then(r => r.json());
|
||||||
|
}
|
||||||
|
function streamSATTask(taskId, title, resetTerminal) {
|
||||||
|
if (satES) { satES.close(); satES = null; }
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
if (resetTerminal) {
|
||||||
|
term.textContent = '';
|
||||||
|
}
|
||||||
|
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||||
|
return new Promise(function(resolve) {
|
||||||
|
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
satES.addEventListener('done', function(e) {
|
||||||
|
satES.close();
|
||||||
|
satES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: !e.data, error: e.data || ''});
|
||||||
|
});
|
||||||
|
satES.onerror = function() {
|
||||||
|
if (satES) {
|
||||||
|
satES.close();
|
||||||
|
satES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: false, error: 'stream disconnected'});
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function selectedAMDValidateTargets() {
|
||||||
|
const targets = [];
|
||||||
|
const gpu = document.getElementById('sat-amd-target');
|
||||||
|
const mem = document.getElementById('sat-amd-mem-target');
|
||||||
|
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||||
|
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||||
|
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||||
|
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||||
|
return targets;
|
||||||
|
}
|
||||||
|
function runSAT(target) {
|
||||||
|
return runSATWithOverrides(target, null);
|
||||||
|
}
|
||||||
|
function runSATWithOverrides(target, overrides) {
|
||||||
|
const title = (overrides && overrides.display_name) || target;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||||
|
return enqueueSATTarget(target, overrides)
|
||||||
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
|
}
|
||||||
|
const nvidiaPerGPUTargets = [];
|
||||||
|
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
|
function satAllGPUIndicesForMulti() {
|
||||||
|
return Promise.resolve(satSelectedGPUIndices());
|
||||||
|
}
|
||||||
|
function expandSATTarget(target) {
|
||||||
|
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||||
|
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||||
|
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||||
|
return Promise.resolve([{target: target}]);
|
||||||
|
}
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
|
}
|
||||||
|
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||||
|
target: target,
|
||||||
|
overrides: {
|
||||||
|
gpu_indices: [Number(gpu.index)],
|
||||||
|
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||||
|
},
|
||||||
|
label: satGPUDisplayName(gpu),
|
||||||
|
})));
|
||||||
|
}
|
||||||
|
function runNvidiaFabricValidate(target) {
|
||||||
|
satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||||
|
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runNvidiaValidateSet(target) {
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||||
|
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||||
|
}
|
||||||
|
function runAMDValidateSet() {
|
||||||
|
const targets = selectedAMDValidateTargets();
|
||||||
|
if (!targets.length) return;
|
||||||
|
if (targets.length === 1) return runSAT(targets[0]);
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— amd';
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
term.textContent = 'Running AMD validate set one by one...\n';
|
||||||
|
const labels = satLabels();
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= targets.length) return Promise.resolve();
|
||||||
|
const target = targets[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||||
|
return enqueueSATTarget(target)
|
||||||
|
.then(d => {
|
||||||
|
return streamSATTask(d.task_id, labels[target], false);
|
||||||
|
}).then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}
|
||||||
|
function runAllSAT() {
|
||||||
|
const cycles = 1;
|
||||||
|
const status = document.getElementById('sat-all-status');
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||||
|
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
|
const activeTargets = baseTargets.filter(target => {
|
||||||
|
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||||
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
|
return !(btn && btn.disabled);
|
||||||
|
});
|
||||||
|
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||||
|
const expanded = [];
|
||||||
|
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||||
|
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||||
|
}
|
||||||
|
const total = expanded.length;
|
||||||
|
let enqueued = 0;
|
||||||
|
if (!total) {
|
||||||
|
status.textContent = 'No tasks selected.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||||
|
const item = expanded[idx];
|
||||||
|
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||||
|
return enqueueSATTarget(item.target, item.overrides)
|
||||||
|
.then(() => {
|
||||||
|
enqueued++;
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}).catch(err => {
|
||||||
|
status.textContent = 'Error: ' + err.message;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
<script>
|
||||||
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
|
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||||
|
});
|
||||||
|
satLoadGPUs();
|
||||||
|
function disableSATAMDOptions(reason) {
|
||||||
|
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||||
|
const cb = document.getElementById(id);
|
||||||
|
if (!cb) return;
|
||||||
|
cb.disabled = true;
|
||||||
|
cb.checked = false;
|
||||||
|
cb.title = reason;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function disableSATCard(id, reason) {
|
||||||
|
const btn = document.getElementById('sat-btn-' + id);
|
||||||
|
if (!btn) return;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.title = reason;
|
||||||
|
btn.style.opacity = '0.4';
|
||||||
|
const card = btn.closest('.card');
|
||||||
|
if (card) {
|
||||||
|
let note = card.querySelector('.sat-unavail');
|
||||||
|
if (!note) {
|
||||||
|
note = document.createElement('p');
|
||||||
|
note.className = 'sat-unavail';
|
||||||
|
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||||
|
const body = card.querySelector('.card-body');
|
||||||
|
if (body) body.insertBefore(note, body.firstChild);
|
||||||
|
}
|
||||||
|
note.textContent = reason;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadValidateInventory builds the Validate-page hardware summaries from the
// audit snapshot at opts.AuditPath. If the snapshot cannot be read or does
// not parse as a schema.HardwareIngestRequest, every summary stays at
// "Audit snapshot not loaded." and both GPU counts stay zero.
func loadValidateInventory(opts HandlerOptions) validateInventory {
	unknown := "Audit snapshot not loaded."
	out := validateInventory{
		CPU:     unknown,
		Memory:  unknown,
		Storage: unknown,
		NVIDIA:  unknown,
		AMD:     unknown,
	}
	data, err := loadSnapshot(opts.AuditPath)
	if err != nil {
		return out
	}
	var snap schema.HardwareIngestRequest
	if err := json.Unmarshal(data, &snap); err != nil {
		return out
	}

	// Tally CPUs by model (falling back to manufacturer, then "unknown").
	// In every category below, devices explicitly marked not-present
	// (Present != nil && !*Present) are skipped; a nil Present counts as
	// present.
	cpuCounts := map[string]int{}
	cpuTotal := 0
	for _, cpu := range snap.Hardware.CPUs {
		if cpu.Present != nil && !*cpu.Present {
			continue
		}
		cpuTotal++
		addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
	}

	// Tally memory modules by part number, then type, then manufacturer.
	memCounts := map[string]int{}
	memTotal := 0
	for _, dimm := range snap.Hardware.Memory {
		if dimm.Present != nil && !*dimm.Present {
			continue
		}
		memTotal++
		addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
	}

	// Tally storage devices by model, then manufacturer.
	storageCounts := map[string]int{}
	storageTotal := 0
	for _, dev := range snap.Hardware.Storage {
		if dev.Present != nil && !*dev.Present {
			continue
		}
		storageTotal++
		addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
	}

	// GPUs are found among PCIe devices; a device matching both vendor
	// checks would be counted in both tallies (vendor checks are
	// independent ifs, not else-if).
	nvidiaCounts := map[string]int{}
	nvidiaTotal := 0
	amdCounts := map[string]int{}
	amdTotal := 0
	for _, dev := range snap.Hardware.PCIeDevices {
		if dev.Present != nil && !*dev.Present {
			continue
		}
		if validateIsVendorGPU(dev, "nvidia") {
			nvidiaTotal++
			addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
		}
		if validateIsVendorGPU(dev, "amd") {
			amdTotal++
			addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
		}
	}

	out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
	out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
	out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
	out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
	out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
	out.NvidiaGPUCount = nvidiaTotal
	out.AMDGPUCount = amdTotal
	return out
}
|
||||||
|
|
||||||
|
// renderValidateCardBody assembles the four stacked sections of a validate
// card: detected devices, description, commands, and settings. Devices and
// settings are rendered muted; description and commands use the plain style.
func renderValidateCardBody(devices, description, commands, settings string) string {
	section := func(style, content string) string {
		return `<div class="validate-card-section"><div style="` + style + `">` + content + `</div></div>`
	}
	const plain = "font-size:13px"
	const muted = "font-size:13px;color:var(--muted)"
	return section(muted, devices) +
		section(plain, description) +
		section(plain, commands) +
		section(muted, settings)
}
|
||||||
|
|
||||||
|
// formatValidateDeviceSummary renders a one-line, HTML-safe summary of detected
// devices, e.g. "2 x ModelA CPUs" or "3 modules: 2 x A, 1 x B". Model names are
// sorted for deterministic output and the unit is pluralized when total != 1.
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	// Nothing of this kind detected: report an explicit zero.
	if total == 0 {
		return "0 " + unit + "s detected."
	}
	modelNames := make([]string, 0, len(models))
	for name := range models {
		modelNames = append(modelNames, name)
	}
	sort.Strings(modelNames)
	entries := make([]string, len(modelNames))
	for i, name := range modelNames {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}
	label := unit
	if total != 1 {
		label += "s"
	}
	// A single model reads more naturally without the leading total.
	if len(entries) == 1 {
		return entries[0] + " " + label
	}
	return fmt.Sprintf("%d %s: %s", total, label, strings.Join(entries, ", "))
}
|
||||||
|
|
||||||
|
// addValidateModel increments the count for a model name in counts, folding
// blank (or whitespace-only) names into the "unknown" bucket.
func addValidateModel(counts map[string]int, name string) {
	key := strings.TrimSpace(name)
	if key == "" {
		key = "unknown"
	}
	counts[key]++
}
|
||||||
|
|
||||||
|
// validateTrimPtr dereferences an optional string and trims surrounding
// whitespace; a nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
|
||||||
|
|
||||||
|
// validateFirstNonEmpty returns the first argument that is non-empty after
// trimming whitespace (the trimmed form is returned), or "" if none is.
func validateFirstNonEmpty(values ...string) string {
	for _, candidate := range values {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
|
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||||
|
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||||
|
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||||
|
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
switch vendor {
|
||||||
|
case "nvidia":
|
||||||
|
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||||
|
case "amd":
|
||||||
|
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||||
|
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||||
|
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||||
|
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||||
|
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
|
||||||
|
if strings.TrimSpace(headerActions) != "" {
|
||||||
|
actions += headerActions
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(`<div class="card"><div class="card-head card-head-actions"><span>%s</span><div class="card-head-buttons">%s</div></div><div class="card-body validate-card-body">%s</div></div>`,
|
||||||
|
label, actions, body)
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
|
|||||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||||
const metricsChartWindow = 360
|
const metricsChartWindow = 360
|
||||||
|
|
||||||
|
// metricsDownsampleAge is the age after which old metrics rows are downsampled
|
||||||
|
// to 1 sample per minute. Data fresher than this is kept at full resolution.
|
||||||
|
const metricsDownsampleAge = 2 * time.Hour
|
||||||
|
|
||||||
|
// metricsRetainWindow is the total retention period for metrics rows.
|
||||||
|
// Rows older than this are deleted entirely by the background compactor.
|
||||||
|
const metricsRetainWindow = 48 * time.Hour
|
||||||
|
|
||||||
var metricsCollectInterval = 5 * time.Second
|
var metricsCollectInterval = 5 * time.Second
|
||||||
|
|
||||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||||
@@ -263,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
|
||||||
|
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
|
||||||
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
@@ -335,13 +345,24 @@ func (h *handler) startMetricsCollector() {
|
|||||||
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||||
ticker := time.NewTicker(metricsCollectInterval)
|
ticker := time.NewTicker(metricsCollectInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
pruneTicker := time.NewTicker(time.Hour)
|
||||||
sample := platform.SampleLiveMetrics()
|
defer pruneTicker.Stop()
|
||||||
if h.metricsDB != nil {
|
for {
|
||||||
_ = h.metricsDB.Write(sample)
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := platform.SampleLiveMetrics()
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
_ = h.metricsDB.Write(sample)
|
||||||
|
}
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
|
case <-pruneTicker.C:
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||||
|
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
h.feedRings(sample)
|
|
||||||
h.setLatestMetric(sample)
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -575,12 +596,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
}
|
}
|
||||||
timeline := metricsTimelineSegments(samples, time.Now())
|
timeline := metricsTimelineSegments(samples, time.Now())
|
||||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
var overviewOk bool
|
||||||
|
var buf []byte
|
||||||
|
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if !ok {
|
if !overviewOk {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -589,23 +612,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
buf, err := renderMetricChartSVG(
|
var buf []byte
|
||||||
title,
|
if stacked {
|
||||||
labels,
|
buf, err = renderStackedMetricChartSVG(
|
||||||
sampleTimes(samples),
|
title,
|
||||||
datasets,
|
labels,
|
||||||
names,
|
sampleTimes(samples),
|
||||||
yMin,
|
datasets,
|
||||||
yMax,
|
names,
|
||||||
chartCanvasHeightForPath(path, len(names)),
|
yMax,
|
||||||
timeline,
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
)
|
timeline,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
buf, err = renderMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMin,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -615,12 +652,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
|
||||||
var datasets [][]float64
|
labels = sampleTimeLabels(samples)
|
||||||
var names []string
|
|
||||||
var title string
|
|
||||||
var yMin, yMax *float64
|
|
||||||
labels := sampleTimeLabels(samples)
|
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case path == "server-load":
|
case path == "server-load":
|
||||||
@@ -657,12 +690,19 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "System Power"
|
title = "System Power"
|
||||||
power := make([]float64, len(samples))
|
power := make([]float64, len(samples))
|
||||||
|
label := "Power W"
|
||||||
for i, s := range samples {
|
for i, s := range samples {
|
||||||
power[i] = s.PowerW
|
power[i] = s.PowerW
|
||||||
|
if strings.TrimSpace(s.PowerSource) != "" {
|
||||||
|
label = fmt.Sprintf("Power W · %s", s.PowerSource)
|
||||||
|
if strings.TrimSpace(s.PowerMode) != "" {
|
||||||
|
label += fmt.Sprintf(" (%s)", s.PowerMode)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
power = normalizePowerSeries(power)
|
power = normalizePowerSeries(power)
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{label}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(power)
|
yMax = autoMax120(power)
|
||||||
|
|
||||||
@@ -707,7 +747,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
case strings.HasPrefix(path, "gpu/"):
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
idx, sub, ok := parseGPUChartPath(path)
|
idx, sub, ok := parseGPUChartPath(path)
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
switch sub {
|
switch sub {
|
||||||
case "load":
|
case "load":
|
||||||
@@ -715,7 +755,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
if util == nil && mem == nil {
|
if util == nil && mem == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||||
names = []string{"Load %", "Mem %"}
|
names = []string{"Load %", "Mem %"}
|
||||||
@@ -725,7 +765,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Temperature"
|
title = gpuDisplayLabel(idx) + " Temperature"
|
||||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
if temp == nil {
|
if temp == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{temp}
|
datasets = [][]float64{temp}
|
||||||
names = []string{"Temp °C"}
|
names = []string{"Temp °C"}
|
||||||
@@ -735,7 +775,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Core Clock"
|
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{clock}
|
datasets = [][]float64{clock}
|
||||||
names = []string{"Core Clock MHz"}
|
names = []string{"Core Clock MHz"}
|
||||||
@@ -744,7 +784,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Memory Clock"
|
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
if clock == nil {
|
if clock == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{clock}
|
datasets = [][]float64{clock}
|
||||||
names = []string{"Memory Clock MHz"}
|
names = []string{"Memory Clock MHz"}
|
||||||
@@ -753,7 +793,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
title = gpuDisplayLabel(idx) + " Power"
|
title = gpuDisplayLabel(idx) + " Power"
|
||||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
if power == nil {
|
if power == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
@@ -761,10 +801,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
}
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
||||||
@@ -930,6 +970,37 @@ func normalizePowerSeries(ds []float64) []float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
|
||||||
|
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]struct{}{}
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, p := range s.PSUs {
|
||||||
|
seen[p.Slot] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(seen))
|
||||||
|
for s := range seen {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
return slots
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
// The result is sized to the longest dataset, so ragged input (datasets of
// unequal length) is summed safely: shorter datasets simply contribute nothing
// past their own length. Previously the accumulator was sized from the first
// dataset only, which panicked with an index-out-of-range when a later dataset
// was longer. Returns nil for empty input.
func psuStackedTotal(datasets [][]float64) []float64 {
	if len(datasets) == 0 {
		return nil
	}
	n := 0
	for _, ds := range datasets {
		if len(ds) > n {
			n = len(ds)
		}
	}
	total := make([]float64, n)
	for _, ds := range datasets {
		for i, v := range ds {
			total[i] += v
		}
	}
	return total
}
|
||||||
|
|
||||||
func normalizeFanSeries(ds []float64) []float64 {
|
func normalizeFanSeries(ds []float64) []float64 {
|
||||||
if len(ds) == 0 {
|
if len(ds) == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("gpu-all-clock returned ok=false")
|
t.Fatal("gpu-all-clock returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -420,6 +420,49 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: start,
|
||||||
|
PSUs: []platform.PSUReading{
|
||||||
|
{Slot: 1, PowerW: 120},
|
||||||
|
{Slot: 2, PowerW: 130},
|
||||||
|
},
|
||||||
|
PowerW: 250,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: start.Add(time.Minute),
|
||||||
|
PSUs: []platform.PSUReading{
|
||||||
|
{Slot: 1, PowerW: 140},
|
||||||
|
{Slot: 2, PowerW: 135},
|
||||||
|
},
|
||||||
|
PowerW: 275,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected server-power chart data")
|
||||||
|
}
|
||||||
|
if title != "System Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if stacked {
|
||||||
|
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
|
||||||
|
}
|
||||||
|
if len(datasets) != 1 || len(names) != 1 {
|
||||||
|
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
|
||||||
|
}
|
||||||
|
if names[0] != "Power W · sdr_psu_input (autotuned)" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
@@ -650,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
|||||||
`/api/gpu/nvidia`,
|
`/api/gpu/nvidia`,
|
||||||
`/api/bee-bench/nvidia/perf/run`,
|
`/api/bee-bench/nvidia/perf/run`,
|
||||||
`/api/bee-bench/nvidia/power/run`,
|
`/api/bee-bench/nvidia/power/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/status`,
|
||||||
`benchmark-run-nccl`,
|
`benchmark-run-nccl`,
|
||||||
`Run Performance Benchmark`,
|
`Run Performance Benchmark`,
|
||||||
`Run Power / Thermal Fit`,
|
`Run Power / Thermal Fit`,
|
||||||
|
`Autotune`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
@@ -744,6 +790,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA Interconnect (NCCL)`,
|
||||||
|
`Validate and Stress:`,
|
||||||
|
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||||
|
`nvbandwidth runs all built-in tests without a time limit`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|||||||
@@ -7,14 +7,43 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
recoverLoopMaxDelay = 60 * time.Second
|
||||||
|
recoverLoopResetAfter = 30 * time.Second
|
||||||
|
)
|
||||||
|
|
||||||
|
// goRecoverLoop starts fn in a goroutine, restarting after panics.
|
||||||
|
// restartDelay is the initial delay; successive panics double it up to
|
||||||
|
// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
|
||||||
|
// successfully for recoverLoopResetAfter without panicking.
|
||||||
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||||
go func() {
|
go func() {
|
||||||
|
delay := restartDelay
|
||||||
|
consecutive := 0
|
||||||
for {
|
for {
|
||||||
if !runRecoverable(name, fn) {
|
start := time.Now()
|
||||||
|
panicked := runRecoverable(name, fn)
|
||||||
|
if !panicked {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if restartDelay > 0 {
|
consecutive++
|
||||||
time.Sleep(restartDelay)
|
if time.Since(start) >= recoverLoopResetAfter {
|
||||||
|
delay = restartDelay
|
||||||
|
consecutive = 1
|
||||||
|
}
|
||||||
|
slog.Warn("goroutine restarting after panic",
|
||||||
|
"component", name,
|
||||||
|
"consecutive_panics", consecutive,
|
||||||
|
"next_delay", delay,
|
||||||
|
)
|
||||||
|
if delay > 0 {
|
||||||
|
time.Sleep(delay)
|
||||||
|
}
|
||||||
|
if delay < recoverLoopMaxDelay {
|
||||||
|
delay *= 2
|
||||||
|
if delay > recoverLoopMaxDelay {
|
||||||
|
delay = recoverLoopMaxDelay
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|||||||
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
|
|||||||
}
|
}
|
||||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
buf, err := renderMetricChartSVG(
|
var buf []byte
|
||||||
title,
|
var err error
|
||||||
labels,
|
if stacked {
|
||||||
sampleTimes(samples),
|
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||||
datasets,
|
} else {
|
||||||
names,
|
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||||
yMin,
|
}
|
||||||
yMax,
|
|
||||||
chartCanvasHeightForPath(path, len(names)),
|
|
||||||
timeline,
|
|
||||||
)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, false
|
return "", nil, false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ var taskNames = map[string]string{
|
|||||||
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
||||||
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
||||||
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
||||||
|
"nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune",
|
||||||
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
||||||
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
||||||
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
||||||
@@ -125,6 +126,7 @@ type taskParams struct {
|
|||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
RampStep int `json:"ramp_step,omitempty"`
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
@@ -162,6 +164,32 @@ type nvidiaRampSpec struct {
|
|||||||
TotalDurationSec int
|
TotalDurationSec int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// resolveMemoryValidatePreset maps a burn profile name (case- and
// whitespace-insensitive) to memory-test parameters. Unknown profiles fall
// back to a stress default (512 MB) or a quick default (256 MB), one pass each.
func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
	presets := map[string][2]int{
		"overnight":  {1024, 2},
		"acceptance": {1024, 1},
		"smoke":      {256, 1},
	}
	if preset, ok := presets[strings.TrimSpace(strings.ToLower(profile))]; ok {
		return preset[0], preset[1]
	}
	if stress {
		return 512, 1
	}
	return 256, 1
}
|
||||||
|
|
||||||
|
// taskMayLeaveOrphanWorkers reports whether a task target (matched case- and
// whitespace-insensitively) spawns external stress/test worker processes that
// can outlive the task and therefore need explicit cleanup on cancel.
func taskMayLeaveOrphanWorkers(target string) bool {
	orphanProne := []string{
		"nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
		"nvidia-bandwidth", "nvidia-stress", "nvidia-compute", "nvidia-bench-perf",
		"memory", "memory-stress", "cpu", "sat-stress", "platform-stress",
	}
	normalized := strings.TrimSpace(strings.ToLower(target))
	for _, candidate := range orphanProne {
		if candidate == normalized {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
func resolveBurnPreset(profile string) burnPreset {
|
func resolveBurnPreset(profile string) burnPreset {
|
||||||
switch profile {
|
switch profile {
|
||||||
case "overnight":
|
case "overnight":
|
||||||
@@ -559,6 +587,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
|
|||||||
if err := writeTaskReportArtifacts(t); err != nil {
|
if err := writeTaskReportArtifacts(t); err != nil {
|
||||||
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||||
}
|
}
|
||||||
|
j.closeLog()
|
||||||
if t.ErrMsg != "" {
|
if t.ErrMsg != "" {
|
||||||
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||||
return
|
return
|
||||||
@@ -587,8 +616,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
}
|
}
|
||||||
a := q.opts.App
|
a := q.opts.App
|
||||||
|
|
||||||
|
recovered := len(j.lines) > 0
|
||||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||||
if len(j.lines) > 0 {
|
if recovered {
|
||||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -658,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
RampTotal: t.params.RampTotal,
|
RampTotal: t.params.RampTotal,
|
||||||
RampRunID: t.params.RampRunID,
|
RampRunID: t.params.RampRunID,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
|
case "nvidia-bench-autotune":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
SizeMB: t.params.SizeMB,
|
||||||
|
}, t.params.BenchmarkKind, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -710,15 +749,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
dur := t.params.Duration
|
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
|
||||||
}
|
|
||||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
|
||||||
DurationSec: dur,
|
|
||||||
Loader: platform.NvidiaStressLoaderNCCL,
|
|
||||||
GPUIndices: t.params.GPUIndices,
|
|
||||||
}, j.append)
|
|
||||||
case "nvidia-stress":
|
case "nvidia-stress":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -751,10 +782,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
sizeMB, passes := 256, 1
|
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
|
||||||
if t.params.StressMode {
|
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
|
||||||
sizeMB, passes = 1024, 3
|
|
||||||
}
|
|
||||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||||
case "storage":
|
case "storage":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
@@ -1010,6 +1039,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
t.job.abort()
|
t.job.abort()
|
||||||
}
|
}
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
platform.KillTestWorkers()
|
||||||
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
@@ -1037,6 +1069,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
|||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
t.job.abort()
|
t.job.abort()
|
||||||
}
|
}
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
platform.KillTestWorkers()
|
||||||
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
taskSerialEvent(t, "finished with status="+t.Status)
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
@@ -1141,10 +1176,13 @@ func (q *taskQueue) loadLocked() {
|
|||||||
q.assignTaskLogPathLocked(t)
|
q.assignTaskLogPathLocked(t)
|
||||||
if t.Status == TaskRunning {
|
if t.Status == TaskRunning {
|
||||||
// The task was interrupted by a bee-web restart. Child processes
|
// The task was interrupted by a bee-web restart. Child processes
|
||||||
// (e.g. bee-gpu-burn-worker) survive the restart in their own
|
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
||||||
// process groups and cannot be cancelled retroactively. Mark the
|
// their own process groups. Kill any matching stale workers before
|
||||||
// task as failed so the user can decide whether to re-run it
|
// marking the task failed so the next GPU test does not inherit a
|
||||||
// rather than blindly re-launching duplicate workers.
|
// busy DCGM slot or duplicate workers.
|
||||||
|
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||||
|
_ = platform.KillTestWorkers()
|
||||||
|
}
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.Status = TaskFailed
|
t.Status = TaskFailed
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
|||||||
@@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
|
||||||
|
var gotSizeMB, gotPasses int
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "mem-validate-1",
|
||||||
|
Name: "Memory SAT",
|
||||||
|
Target: "memory",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{StressMode: true},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runMemoryAcceptancePackCtx
|
||||||
|
runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
|
||||||
|
gotSizeMB = sizeMB
|
||||||
|
gotPasses = passes
|
||||||
|
return "/tmp/memory-validate.tar.gz", nil
|
||||||
|
}
|
||||||
|
defer func() { runMemoryAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotSizeMB != 512 || gotPasses != 1 {
|
||||||
|
t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
q := &taskQueue{
|
q := &taskQueue{
|
||||||
|
|||||||
2
bible
2
bible
Submodule bible updated: 1d89a4918e...98448c993f
@@ -110,8 +110,12 @@ nvidia-smi / lspci (audit collection)
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## What Needs Fixing
|
## Fixed Issues
|
||||||
|
|
||||||
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
|
All previously open items are resolved:
|
||||||
2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
|
|
||||||
3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
|
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
|
||||||
|
2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
|
||||||
|
3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
|
||||||
|
4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
|
||||||
|
5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
|
||||||
|
|||||||
@@ -15,6 +15,41 @@ This applies to:
|
|||||||
- `iso/builder/config/package-lists/*.list.chroot`
|
- `iso/builder/config/package-lists/*.list.chroot`
|
||||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||||
|
|
||||||
|
## Bootloader sync rule
|
||||||
|
|
||||||
|
The ISO has two independent bootloader configs that must be kept in sync manually:
|
||||||
|
|
||||||
|
| File | Used by |
|
||||||
|
|------|---------|
|
||||||
|
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
|
||||||
|
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
|
||||||
|
|
||||||
|
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
|
||||||
|
change, or new mode added to one file must be manually mirrored in the other.
|
||||||
|
|
||||||
|
**Canonical entry list** (both files must have all of these):
|
||||||
|
|
||||||
|
| Label | Key params |
|
||||||
|
|-------|-----------|
|
||||||
|
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
|
||||||
|
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
|
||||||
|
|
||||||
|
**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
|
||||||
|
```
|
||||||
|
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
|
||||||
|
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
nowatchdog nosoftlockup
|
||||||
|
```
|
||||||
|
(fail-safe is the exception — it deliberately uses minimal params.)
|
||||||
|
|
||||||
|
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
|
||||||
|
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
|
||||||
|
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
|
||||||
|
|
||||||
## Memtest rule
|
## Memtest rule
|
||||||
|
|
||||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
DEBIAN_VERSION=12
|
DEBIAN_VERSION=12
|
||||||
DEBIAN_KERNEL_ABI=auto
|
DEBIAN_KERNEL_ABI=auto
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
NVIDIA_DRIVER_VERSION=590.48.01
|
||||||
|
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
|
|||||||
@@ -32,7 +32,8 @@ lb config noauto \
|
|||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
|
--debootstrap-options "--include=ca-certificates" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -35,6 +35,8 @@ typedef void *CUstream;
|
|||||||
#define MAX_STRESS_STREAMS 16
|
#define MAX_STRESS_STREAMS 16
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
|
#define MAX_SINGLE_PRECISION_STREAMS 4
|
||||||
|
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
|
|||||||
return stream_count;
|
return stream_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
|
||||||
|
if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
|
||||||
|
return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
return profile_budget_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||||
if (!api->cuStreamDestroy) {
|
if (!api->cuStreamDestroy) {
|
||||||
return;
|
return;
|
||||||
@@ -704,6 +713,19 @@ static const struct profile_desc k_profiles[] = {
|
|||||||
|
|
||||||
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
||||||
|
|
||||||
|
static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
|
||||||
|
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
return strcmp(desc->block_label, precision_filter) == 0;
|
||||||
|
}
|
||||||
|
/* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
|
||||||
|
* unstable on the current benchmark fleet and can abort the whole mixed
|
||||||
|
* pass after earlier phases already collected useful telemetry. */
|
||||||
|
return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int load_cublaslt(struct cublaslt_api *api) {
|
static int load_cublaslt(struct cublaslt_api *api) {
|
||||||
memset(api, 0, sizeof(*api));
|
memset(api, 0, sizeof(*api));
|
||||||
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
||||||
@@ -908,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
CUstream stream,
|
CUstream stream,
|
||||||
size_t profile_budget_bytes,
|
size_t profile_budget_bytes,
|
||||||
struct prepared_profile *out) {
|
struct prepared_profile *out) {
|
||||||
memset(out, 0, sizeof(*out));
|
|
||||||
out->desc = *desc;
|
|
||||||
out->stream = stream;
|
|
||||||
|
|
||||||
size_t bytes_per_cell = 0;
|
size_t bytes_per_cell = 0;
|
||||||
|
size_t attempt_budget = profile_budget_bytes;
|
||||||
|
|
||||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
||||||
@@ -921,106 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
|
||||||
out->m = dim;
|
memset(out, 0, sizeof(*out));
|
||||||
out->n = dim;
|
out->desc = *desc;
|
||||||
out->k = dim;
|
out->stream = stream;
|
||||||
|
|
||||||
size_t desired_workspace = profile_budget_bytes / 8u;
|
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
|
||||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
out->m = dim;
|
||||||
desired_workspace = 32u * 1024u * 1024u;
|
out->n = dim;
|
||||||
}
|
out->k = dim;
|
||||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
|
||||||
|
|
||||||
size_t a_bytes = 0;
|
size_t desired_workspace = attempt_budget / 8u;
|
||||||
size_t b_bytes = 0;
|
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||||
size_t c_bytes = 0;
|
desired_workspace = 32u * 1024u * 1024u;
|
||||||
size_t d_bytes = 0;
|
}
|
||||||
size_t scale_bytes = 0;
|
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||||
while (1) {
|
|
||||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
|
||||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
|
||||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
|
||||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
|
||||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
|
||||||
|
|
||||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
size_t a_bytes = 0;
|
||||||
if (matrix_bytes <= profile_budget_bytes) {
|
size_t b_bytes = 0;
|
||||||
size_t remaining = profile_budget_bytes - matrix_bytes;
|
size_t c_bytes = 0;
|
||||||
out->workspace_size = desired_workspace;
|
size_t d_bytes = 0;
|
||||||
if (out->workspace_size > remaining) {
|
size_t scale_bytes = 0;
|
||||||
out->workspace_size = round_down_size(remaining, 256u);
|
while (1) {
|
||||||
|
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||||
|
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||||
|
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||||
|
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||||
|
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||||
|
|
||||||
|
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||||
|
if (matrix_bytes <= attempt_budget) {
|
||||||
|
size_t remaining = attempt_budget - matrix_bytes;
|
||||||
|
out->workspace_size = desired_workspace;
|
||||||
|
if (out->workspace_size > remaining) {
|
||||||
|
out->workspace_size = round_down_size(remaining, 256u);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
|
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
out->m -= (uint64_t)desc->min_multiple;
|
||||||
|
out->n = out->m;
|
||||||
|
out->k = out->m;
|
||||||
|
}
|
||||||
|
if (out->m < (uint64_t)desc->min_multiple) {
|
||||||
|
attempt_budget /= 2u;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||||
return 0;
|
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||||
}
|
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||||
out->m -= (uint64_t)desc->min_multiple;
|
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||||
out->n = out->m;
|
|
||||||
out->k = out->m;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaDataType_t scale_type = matmul_scale_type(desc);
|
|
||||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
|
||||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
cublasOperation_t transa = CUBLAS_OP_T;
|
|
||||||
cublasOperation_t transb = CUBLAS_OP_N;
|
|
||||||
if (!check_cublas("set TRANSA",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
|
||||||
&transa,
|
|
||||||
sizeof(transa))) ||
|
|
||||||
!check_cublas("set TRANSB",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
|
||||||
&transb,
|
|
||||||
sizeof(transb)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (desc->needs_scalar_scale) {
|
|
||||||
float one = 1.0f;
|
|
||||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
|
||||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||||
|
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||||
|
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
|
||||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
cublasOperation_t transa = CUBLAS_OP_T;
|
||||||
if (!check_cublas("set A scale ptr",
|
cublasOperation_t transb = CUBLAS_OP_N;
|
||||||
|
if (!check_cublas("set TRANSA",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||||
&a_scale_ptr,
|
&transa,
|
||||||
sizeof(a_scale_ptr))) ||
|
sizeof(transa))) ||
|
||||||
!check_cublas("set B scale ptr",
|
!check_cublas("set TRANSB",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||||
&b_scale_ptr,
|
&transb,
|
||||||
sizeof(b_scale_ptr)))) {
|
sizeof(transb)))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
if (desc->needs_scalar_scale) {
|
||||||
|
float one = 1.0f;
|
||||||
|
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||||
|
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||||
|
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||||
|
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||||
|
if (!check_cublas("set A scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||||
|
&a_scale_ptr,
|
||||||
|
sizeof(a_scale_ptr))) ||
|
||||||
|
!check_cublas("set B scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||||
|
&b_scale_ptr,
|
||||||
|
sizeof(b_scale_ptr)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
||||||
if (desc->needs_block_scale) {
|
if (desc->needs_block_scale) {
|
||||||
@@ -1060,62 +1089,65 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!check_cublas("create A layout",
|
if (!check_cublas("create A layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||||
!check_cublas("create B layout",
|
!check_cublas("create B layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||||
!check_cublas("create C layout",
|
!check_cublas("create C layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||||
!check_cublas("create D layout",
|
!check_cublas("create D layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (out->workspace_size > 0) {
|
|
||||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (out->workspace_size > 0) {
|
||||||
|
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("set workspace",
|
||||||
|
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||||
|
out->preference,
|
||||||
|
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||||
|
&out->workspace_size,
|
||||||
|
sizeof(out->workspace_size)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int found = 0;
|
||||||
|
if (check_cublas("heuristic",
|
||||||
|
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||||
|
out->op_desc,
|
||||||
|
out->a_layout,
|
||||||
|
out->b_layout,
|
||||||
|
out->c_layout,
|
||||||
|
out->d_layout,
|
||||||
|
out->preference,
|
||||||
|
1,
|
||||||
|
&out->heuristic,
|
||||||
|
&found)) &&
|
||||||
|
found > 0) {
|
||||||
|
out->ready = 1;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
|
||||||
|
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!check_cublas("set workspace",
|
return 0;
|
||||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
|
||||||
out->preference,
|
|
||||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
|
||||||
&out->workspace_size,
|
|
||||||
sizeof(out->workspace_size)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int found = 0;
|
|
||||||
if (!check_cublas("heuristic",
|
|
||||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
|
||||||
out->op_desc,
|
|
||||||
out->a_layout,
|
|
||||||
out->b_layout,
|
|
||||||
out->c_layout,
|
|
||||||
out->d_layout,
|
|
||||||
out->preference,
|
|
||||||
1,
|
|
||||||
&out->heuristic,
|
|
||||||
&found))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (found <= 0) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
out->ready = 1;
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||||
@@ -1180,6 +1212,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
size_t per_profile_budget = 0;
|
size_t per_profile_budget = 0;
|
||||||
|
int budget_profiles = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||||
@@ -1202,8 +1235,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
|
|
||||||
/* Count profiles matching the filter (for deciding what to run). */
|
/* Count profiles matching the filter (for deciding what to run). */
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
|
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||||
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
|
|
||||||
planned++;
|
planned++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1215,30 +1247,41 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Count all profiles active on this GPU regardless of filter.
|
/* Count all profiles active on this GPU regardless of filter.
|
||||||
* Used as the budget divisor so matrix sizes stay consistent whether
|
* Mixed phases still divide budget across the full precision set, while
|
||||||
* running all precisions together or a single-precision phase. */
|
* single-precision benchmark phases dedicate budget only to active
|
||||||
|
* profiles matching precision_filter. */
|
||||||
int planned_total = 0;
|
int planned_total = 0;
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||||
planned_total++;
|
planned_total++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (planned_total < planned) {
|
if (planned_total < planned) {
|
||||||
planned_total = planned;
|
planned_total = planned;
|
||||||
}
|
}
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
budget_profiles = planned;
|
||||||
|
}
|
||||||
|
if (budget_profiles <= 0) {
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
}
|
||||||
|
|
||||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||||
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||||
cuda->cuStreamCreate &&
|
cuda->cuStreamCreate &&
|
||||||
cuda->cuStreamDestroy) {
|
cuda->cuStreamDestroy) {
|
||||||
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
|
stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
|
||||||
|
}
|
||||||
|
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
|
||||||
|
stream_count = MAX_SINGLE_PRECISION_STREAMS;
|
||||||
}
|
}
|
||||||
if (stream_count > 1) {
|
if (stream_count > 1) {
|
||||||
int created = 0;
|
int created = 0;
|
||||||
@@ -1251,18 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
report->stream_count = stream_count;
|
report->stream_count = stream_count;
|
||||||
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
|
per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
|
||||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
|
||||||
|
}
|
||||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
mp_count,
|
mp_count,
|
||||||
|
budget_profiles,
|
||||||
per_profile_budget / (1024u * 1024u));
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
for (int i = 0; i < profile_count; i++) {
|
for (int i = 0; i < profile_count; i++) {
|
||||||
@@ -1275,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
desc->min_cc);
|
desc->min_cc);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
|
if (!profile_allowed_for_run(desc, cc, precision_filter)) {
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"%s=SKIPPED precision_filter\n",
|
"%s=SKIPPED benchmark_disabled\n",
|
||||||
desc->name);
|
desc->name);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -126,6 +126,37 @@ resolve_iso_version() {
|
|||||||
resolve_audit_version
|
resolve_audit_version
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sync_builder_workdir() {
|
||||||
|
src_dir="$1"
|
||||||
|
dst_dir="$2"
|
||||||
|
|
||||||
|
mkdir -p "$dst_dir"
|
||||||
|
|
||||||
|
# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
|
||||||
|
# after the source tree moved to grub-efi only. Remove bootloaders eagerly
|
||||||
|
# so reused workdirs cannot leak stale templates into a new ISO build.
|
||||||
|
rm -rf "$dst_dir/config/bootloaders"
|
||||||
|
|
||||||
|
rsync -a --delete \
|
||||||
|
--exclude='cache/' \
|
||||||
|
--exclude='chroot/' \
|
||||||
|
--exclude='.build/' \
|
||||||
|
--exclude='*.iso' \
|
||||||
|
--exclude='*.packages' \
|
||||||
|
--exclude='*.contents' \
|
||||||
|
--exclude='*.files' \
|
||||||
|
"$src_dir/" "$dst_dir/"
|
||||||
|
|
||||||
|
if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
|
||||||
|
echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
|
||||||
|
echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
iso_list_files() {
|
iso_list_files() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
|
|
||||||
@@ -203,7 +234,7 @@ dump_memtest_debug() {
|
|||||||
|
|
||||||
echo "-- source bootloader templates --"
|
echo "-- source bootloader templates --"
|
||||||
for cfg in \
|
for cfg in \
|
||||||
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
|
"${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
|
||||||
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
||||||
if [ -f "$cfg" ]; then
|
if [ -f "$cfg" ]; then
|
||||||
echo " file: $cfg"
|
echo " file: $cfg"
|
||||||
@@ -466,6 +497,75 @@ validate_iso_memtest() {
|
|||||||
echo "=== memtest validation OK ==="
|
echo "=== memtest validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validate_iso_live_boot_entries() {
|
||||||
|
iso_path="$1"
|
||||||
|
echo "=== validating live boot entries in ISO ==="
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || {
|
||||||
|
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||||
|
echo "ERROR: ISO reader unavailable for live boot validation" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grub_cfg="$(mktemp)"
|
||||||
|
isolinux_cfg="$(mktemp)"
|
||||||
|
|
||||||
|
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||||
|
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
|
||||||
|
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB toram entry is missing" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'linux .*boot=live ' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB live entry is missing boot=live" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux live entry is missing boot=live" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
echo "=== live boot validation OK ==="
|
||||||
|
}
|
||||||
|
|
||||||
validate_iso_nvidia_runtime() {
|
validate_iso_nvidia_runtime() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||||
@@ -542,6 +642,185 @@ label memtest
|
|||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extract_live_grub_entry() {
|
||||||
|
cfg="$1"
|
||||||
|
live_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
live_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
[ -n "$live_linux" ] || return 1
|
||||||
|
[ -n "$live_initrd" ] || return 1
|
||||||
|
|
||||||
|
grub_kernel="$(printf '%s\n' "$live_linux" | awk '{print $2}')"
|
||||||
|
grub_append="$(printf '%s\n' "$live_linux" | cut -d' ' -f3-)"
|
||||||
|
grub_initrd="$(printf '%s\n' "$live_initrd" | awk '{print $2}')"
|
||||||
|
[ -n "$grub_kernel" ] || return 1
|
||||||
|
[ -n "$grub_append" ] || return 1
|
||||||
|
[ -n "$grub_initrd" ] || return 1
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
load_live_build_append() {
|
||||||
|
lb_dir="$1"
|
||||||
|
binary_cfg="$lb_dir/config/binary"
|
||||||
|
[ -f "$binary_cfg" ] || return 1
|
||||||
|
|
||||||
|
# config/binary is generated by live-build and contains shell variable
|
||||||
|
# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
|
||||||
|
# shellcheck disable=SC1090
|
||||||
|
. "$binary_cfg"
|
||||||
|
|
||||||
|
[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
|
||||||
|
live_build_append="$LB_BOOTAPPEND_LIVE"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
extract_live_isolinux_entry() {
|
||||||
|
cfg="$1"
|
||||||
|
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
isolinux_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
isolinux_append="$(awk '/^[[:space:]]*append[[:space:]]+/ { sub(/^[[:space:]]*append[[:space:]]+/, ""); print; exit }' "$cfg")"
|
||||||
|
[ -n "$isolinux_linux" ] || return 1
|
||||||
|
[ -n "$isolinux_initrd" ] || return 1
|
||||||
|
[ -n "$isolinux_append" ] || return 1
|
||||||
|
|
||||||
|
isolinux_kernel="$(printf '%s\n' "$isolinux_linux" | awk '{print $2}')"
|
||||||
|
isolinux_initrd_path="$(printf '%s\n' "$isolinux_initrd" | awk '{print $2}')"
|
||||||
|
[ -n "$isolinux_kernel" ] || return 1
|
||||||
|
[ -n "$isolinux_initrd_path" ] || return 1
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
write_canonical_grub_cfg() {
|
||||||
|
cfg="$1"
|
||||||
|
kernel="$2"
|
||||||
|
append_live="$3"
|
||||||
|
initrd="$4"
|
||||||
|
|
||||||
|
cat > "$cfg" <<EOF
|
||||||
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||||
|
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||||
|
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||||
|
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||||
|
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||||
|
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||||
|
echo " Hardware Audit LiveCD"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
menuentry "EASY-BEE" {
|
||||||
|
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd ${initrd}
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
|
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd ${initrd}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
fwsetup
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
write_canonical_isolinux_cfg() {
|
||||||
|
cfg="$1"
|
||||||
|
kernel="$2"
|
||||||
|
initrd="$3"
|
||||||
|
append_live="$4"
|
||||||
|
|
||||||
|
cat > "$cfg" <<EOF
|
||||||
|
label live-@FLAVOUR@-normal
|
||||||
|
menu label ^EASY-BEE
|
||||||
|
menu default
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-toram
|
||||||
|
menu label EASY-BEE (^load to RAM)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-gsp-off
|
||||||
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms
|
||||||
|
menu label EASY-BEE (^KMS, no nomodeset)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (KMS, ^GSP=off)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-failsafe
|
||||||
|
menu label EASY-BEE (^fail-safe)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
|
label memtest
|
||||||
|
menu label ^Memory Test (memtest86+)
|
||||||
|
linux /boot/memtest86+x64.bin
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
enforce_live_build_bootloader_assets() {
|
||||||
|
lb_dir="$1"
|
||||||
|
grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
|
||||||
|
grub_dir="$lb_dir/binary/boot/grub"
|
||||||
|
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||||
|
|
||||||
|
if ! load_live_build_append "$lb_dir"; then
|
||||||
|
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
|
||||||
|
live_build_append=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "$grub_cfg" ]; then
|
||||||
|
if extract_live_grub_entry "$grub_cfg"; then
|
||||||
|
mkdir -p "$grub_dir/live-theme"
|
||||||
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||||
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
||||||
|
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
||||||
|
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||||
|
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||||
|
else
|
||||||
|
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "$isolinux_cfg" ]; then
|
||||||
|
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
||||||
|
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
|
||||||
|
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
||||||
|
else
|
||||||
|
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
copy_memtest_from_deb() {
|
copy_memtest_from_deb() {
|
||||||
deb="$1"
|
deb="$1"
|
||||||
dst_boot="$2"
|
dst_boot="$2"
|
||||||
@@ -932,15 +1211,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
|||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
# Sync builder config into variant work dir, preserving lb cache.
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
rsync -a --delete \
|
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
|
||||||
--exclude='cache/' \
|
|
||||||
--exclude='chroot/' \
|
|
||||||
--exclude='.build/' \
|
|
||||||
--exclude='*.iso' \
|
|
||||||
--exclude='*.packages' \
|
|
||||||
--exclude='*.contents' \
|
|
||||||
--exclude='*.files' \
|
|
||||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
|
||||||
|
|
||||||
# Share deb package cache across variants.
|
# Share deb package cache across variants.
|
||||||
# Restore: populate work dir cache from shared cache before build.
|
# Restore: populate work dir cache from shared cache before build.
|
||||||
@@ -954,86 +1225,6 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
|||||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
|
|
||||||
cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
|
|
||||||
source /boot/grub/config.cfg
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
|
||||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
|
||||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
|
||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
|
||||||
echo " Hardware Audit LiveCD"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
|
||||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
|
||||||
chainloader /boot/memtest86+x64.efi
|
|
||||||
}
|
|
||||||
else
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
|
||||||
linux16 /boot/memtest86+x64.bin
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
|
||||||
menuentry "UEFI Firmware Settings" {
|
|
||||||
fwsetup
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
EOF
|
|
||||||
|
|
||||||
cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
|
|
||||||
label live-@FLAVOUR@-normal
|
|
||||||
menu label ^EASY-BEE
|
|
||||||
menu default
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms
|
|
||||||
menu label EASY-BEE (^graphics/KMS)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ bee.display=kms
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
|
||||||
menu label EASY-BEE (^load to RAM)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ toram
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
|
||||||
menu label EASY-BEE (^fail-safe)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
|
||||||
|
|
||||||
label memtest
|
|
||||||
menu label ^Memory Test (memtest86+)
|
|
||||||
linux /boot/memtest86+x64.bin
|
|
||||||
EOF
|
|
||||||
fi
|
|
||||||
|
|
||||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||||
rm -f \
|
rm -f \
|
||||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||||
@@ -1262,6 +1453,7 @@ fi
|
|||||||
# --- substitute version placeholders in package list and archive ---
|
# --- substitute version placeholders in package list and archive ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
sed -i \
|
sed -i \
|
||||||
|
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
@@ -1304,10 +1496,18 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
|||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
|
echo "=== enforcing canonical bootloader assets ==="
|
||||||
|
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||||
|
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||||
|
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
|
||||||
|
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||||
|
|
||||||
# --- persist deb package cache back to shared location ---
|
# --- persist deb package cache back to shared location ---
|
||||||
# This allows the second variant to reuse all downloaded packages.
|
# This allows the second variant to reuse all downloaded packages.
|
||||||
@@ -1332,6 +1532,7 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
@@ -11,27 +11,16 @@ echo " Hardware Audit LiveCD"
|
|||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
menuentry "EASY-BEE — GSP=off" {
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
initrd @INITRD_LIVE@
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
chainloader /boot/memtest86+x64.efi
|
chainloader /boot/memtest86+x64.efi
|
||||||
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
@@ -5,6 +5,13 @@ title-text: ""
|
|||||||
message-font: "Unifont Regular 16"
|
message-font: "Unifont Regular 16"
|
||||||
terminal-font: "Unifont Regular 16"
|
terminal-font: "Unifont Regular 16"
|
||||||
|
|
||||||
|
#bee logo - centered, upper third of screen
|
||||||
|
+ image {
|
||||||
|
top = 4%
|
||||||
|
left = 50%-200
|
||||||
|
file = "bee-logo.png"
|
||||||
|
}
|
||||||
|
|
||||||
#help bar at the bottom
|
#help bar at the bottom
|
||||||
+ label {
|
+ label {
|
||||||
top = 100%-50
|
top = 100%-50
|
||||||
@@ -21,8 +28,8 @@ terminal-font: "Unifont Regular 16"
|
|||||||
+ boot_menu {
|
+ boot_menu {
|
||||||
left = 20%
|
left = 20%
|
||||||
width = 60%
|
width = 60%
|
||||||
top = 62%
|
top = 65%
|
||||||
height = 38%-80
|
height = 35%-80
|
||||||
item_color = "#c88000"
|
item_color = "#c88000"
|
||||||
item_font = "Unifont Regular 16"
|
item_font = "Unifont Regular 16"
|
||||||
selected_item_color= "#f5a800"
|
selected_item_color= "#f5a800"
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
set color_normal=light-gray/black
|
set color_normal=light-gray/black
|
||||||
set color_highlight=yellow/black
|
set color_highlight=yellow/black
|
||||||
|
|
||||||
if [ -e /boot/grub/splash.png ]; then
|
if [ -e /boot/grub/live-theme/theme.txt ]; then
|
||||||
set theme=/boot/grub/live-theme/theme.txt
|
set theme=/boot/grub/live-theme/theme.txt
|
||||||
else
|
else
|
||||||
set menu_color_normal=yellow/black
|
set menu_color_normal=yellow/black
|
||||||
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
|
|||||||
menu default
|
menu default
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms
|
|
||||||
menu label EASY-BEE (^graphics/KMS)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE (^load to RAM)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-gsp-off
|
label live-@FLAVOUR@-gsp-off
|
||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms-gsp-off
|
label live-@FLAVOUR@-kms
|
||||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
menu label EASY-BEE (^KMS, no nomodeset)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (KMS, ^GSP=off)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
label memtest
|
label memtest
|
||||||
menu label ^Memory Test (memtest86+)
|
menu label ^Memory Test (memtest86+)
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
|||||||
# Enable GPU-vendor specific services
|
# Enable GPU-vendor specific services
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||||
systemctl enable bee-nvidia.service
|
systemctl enable bee-nvidia.service
|
||||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
@@ -62,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
@@ -1,117 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
|
||||||
set -e
|
|
||||||
echo "=== generating bee wallpaper ==="
|
|
||||||
mkdir -p /usr/share/bee
|
|
||||||
|
|
||||||
python3 - <<'PYEOF'
|
|
||||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
|
||||||
import os
|
|
||||||
|
|
||||||
W, H = 1920, 1080
|
|
||||||
|
|
||||||
ASCII_ART = [
|
|
||||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
|
||||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
|
||||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
|
||||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
|
||||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
|
||||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
|
||||||
]
|
|
||||||
SUBTITLE = " Hardware Audit LiveCD"
|
|
||||||
|
|
||||||
FG = (0xF6, 0xD0, 0x47)
|
|
||||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
|
||||||
SHADOW = (0x5E, 0x47, 0x05)
|
|
||||||
SUB = (0x96, 0x7A, 0x17)
|
|
||||||
BG = (0x05, 0x05, 0x05)
|
|
||||||
|
|
||||||
MONO_FONT_CANDIDATES = [
|
|
||||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
|
||||||
]
|
|
||||||
SUB_FONT_CANDIDATES = [
|
|
||||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
|
||||||
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def load_font(candidates, size):
|
|
||||||
for path in candidates:
|
|
||||||
if os.path.exists(path):
|
|
||||||
return ImageFont.truetype(path, size)
|
|
||||||
return ImageFont.load_default()
|
|
||||||
|
|
||||||
|
|
||||||
def mono_metrics(font):
|
|
||||||
probe = Image.new('L', (W, H), 0)
|
|
||||||
draw = ImageDraw.Draw(probe)
|
|
||||||
char_w = int(round(draw.textlength("M", font=font)))
|
|
||||||
bb = draw.textbbox((0, 0), "Mg", font=font)
|
|
||||||
char_h = bb[3] - bb[1]
|
|
||||||
return char_w, char_h
|
|
||||||
|
|
||||||
|
|
||||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
|
||||||
width = max(len(line) for line in lines) * char_w
|
|
||||||
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
|
||||||
mask = Image.new('L', (width, height), 0)
|
|
||||||
draw = ImageDraw.Draw(mask)
|
|
||||||
for row, line in enumerate(lines):
|
|
||||||
y = row * (char_h + line_gap)
|
|
||||||
for col, ch in enumerate(line):
|
|
||||||
if ch == ' ':
|
|
||||||
continue
|
|
||||||
x = col * char_w
|
|
||||||
draw.text((x, y), ch, font=font, fill=255)
|
|
||||||
return mask
|
|
||||||
|
|
||||||
|
|
||||||
img = Image.new('RGB', (W, H), BG)
|
|
||||||
draw = ImageDraw.Draw(img)
|
|
||||||
|
|
||||||
# Soft amber glow under the logo without depending on font rendering.
|
|
||||||
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
|
||||||
glow_draw = ImageDraw.Draw(glow)
|
|
||||||
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
|
||||||
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
|
||||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
|
||||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
|
||||||
|
|
||||||
TARGET_LOGO_W = 400
|
|
||||||
max_chars = max(len(line) for line in ASCII_ART)
|
|
||||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
|
||||||
_probe_cw, _ = mono_metrics(_probe_font)
|
|
||||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
|
||||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
|
||||||
char_w, char_h = mono_metrics(font_logo)
|
|
||||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
|
||||||
logo_w, logo_h = logo_mask.size
|
|
||||||
logo_x = (W - logo_w) // 2
|
|
||||||
logo_y = 380
|
|
||||||
|
|
||||||
sh_off = max(1, font_size_logo // 6)
|
|
||||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
|
||||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
|
||||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
|
||||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
|
||||||
|
|
||||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
|
||||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
|
||||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
|
||||||
sub_y = logo_y + logo_h + 48
|
|
||||||
draw = ImageDraw.Draw(img)
|
|
||||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
|
||||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
|
||||||
|
|
||||||
img = img.convert('RGB')
|
|
||||||
|
|
||||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
|
||||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
|
||||||
PYEOF
|
|
||||||
|
|
||||||
echo "=== wallpaper done ==="
|
|
||||||
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9011-toram-rsync.hook.chroot
|
||||||
|
#
|
||||||
|
# Adds rsync to the initramfs so that live-boot's toram code takes the
|
||||||
|
# rsync --progress path instead of the silent "cp -a" fallback.
|
||||||
|
#
|
||||||
|
# live-boot's 9990-toram-todisk.sh already contains:
|
||||||
|
# if [ -x /bin/rsync ]; then
|
||||||
|
# rsync -a --progress ... 1>/dev/console
|
||||||
|
# else
|
||||||
|
# cp -a ... # no output
|
||||||
|
# fi
|
||||||
|
#
|
||||||
|
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
|
||||||
|
# which copies the binary + all shared-library dependencies into the initrd.
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
HOOK_DIR="/etc/initramfs-tools/hooks"
|
||||||
|
HOOK="${HOOK_DIR}/bee-rsync"
|
||||||
|
|
||||||
|
mkdir -p "${HOOK_DIR}"
|
||||||
|
|
||||||
|
cat > "${HOOK}" << 'EOF'
|
||||||
|
#!/bin/sh
|
||||||
|
# initramfs hook: include rsync for live-boot toram progress output
|
||||||
|
PREREQ=""
|
||||||
|
prereqs() { echo "$PREREQ"; }
|
||||||
|
case "$1" in prereqs) prereqs; exit 0 ;; esac
|
||||||
|
|
||||||
|
. /usr/share/initramfs-tools/hook-functions
|
||||||
|
|
||||||
|
if [ -x /usr/bin/rsync ]; then
|
||||||
|
copy_exec /usr/bin/rsync /bin
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
|
||||||
|
chmod +x "${HOOK}"
|
||||||
|
|
||||||
|
echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
|
||||||
|
|
||||||
|
# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
|
||||||
|
KVER=$(ls /lib/modules | sort -V | tail -1)
|
||||||
|
echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
|
||||||
|
update-initramfs -u -k "${KVER}"
|
||||||
|
echo "9011-toram-rsync: done"
|
||||||
@@ -5,6 +5,7 @@
|
|||||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||||
# explicitly.
|
# explicitly.
|
||||||
|
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ dmidecode
|
|||||||
smartmontools
|
smartmontools
|
||||||
nvme-cli
|
nvme-cli
|
||||||
pciutils
|
pciutils
|
||||||
|
rsync
|
||||||
ipmitool
|
ipmitool
|
||||||
util-linux
|
util-linux
|
||||||
e2fsprogs
|
e2fsprogs
|
||||||
|
|||||||
@@ -10,6 +10,8 @@ RestartSec=3
|
|||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
LimitMEMLOCK=infinity
|
LimitMEMLOCK=infinity
|
||||||
|
# No MemoryMax: bee-web spawns GPU test subprocesses (dcgmproftester etc.)
|
||||||
|
# that legitimately use several GB; a cgroup limit kills them via OOM.
|
||||||
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
||||||
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
||||||
Nice=0
|
Nice=0
|
||||||
|
|||||||
@@ -65,6 +65,9 @@ done
|
|||||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
||||||
|
echo " The live medium may have been disconnected." >&2
|
||||||
|
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||||
|
echo " Then re-run bee-install." >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -162,10 +165,59 @@ log " Mounted."
|
|||||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||||
log " Source: $SQUASHFS"
|
log " Source: $SQUASHFS"
|
||||||
log " Target: $MOUNT_ROOT"
|
log " Target: $MOUNT_ROOT"
|
||||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
|
||||||
grep -E '^\[|^inod|^created|^extract' | \
|
# unsquashfs does not support resume, so retry the entire unpack step if the
|
||||||
while read -r line; do log " $line"; done || true
|
# source medium disappears mid-copy (e.g. CD physically disconnected).
|
||||||
log " Unpack complete."
|
UNPACK_ATTEMPTS=0
|
||||||
|
UNPACK_MAX=5
|
||||||
|
while true; do
|
||||||
|
UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
|
||||||
|
die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
|
||||||
|
fi
|
||||||
|
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
||||||
|
|
||||||
|
# Re-check squashfs is reachable before each attempt
|
||||||
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
|
log " SOURCE LOST: $SQUASHFS not found."
|
||||||
|
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
||||||
|
log " then press Enter here to retry."
|
||||||
|
read -r _
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# wipe partial unpack so unsquashfs starts clean
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
|
||||||
|
log " Cleaning partial unpack from $MOUNT_ROOT ..."
|
||||||
|
# keep the mount point itself but remove its contents
|
||||||
|
find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
UNPACK_OK=0
|
||||||
|
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||||
|
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||||
|
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||||
|
|
||||||
|
# Check squashfs is still reachable (gone = disc pulled during copy)
|
||||||
|
if [ ! -f "$SQUASHFS" ]; then
|
||||||
|
log " WARNING: source medium lost during unpack — will retry after remount."
|
||||||
|
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
||||||
|
read -r _
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Verify the unpack produced a usable root (presence of /etc is a basic check)
|
||||||
|
if [ -d "${MOUNT_ROOT}/etc" ]; then
|
||||||
|
log " Unpack complete."
|
||||||
|
break
|
||||||
|
else
|
||||||
|
log " WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
|
||||||
|
if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
|
||||||
|
log " Retrying in 5 s ..."
|
||||||
|
sleep 5
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
log "--- Step 6/7: Configuring installed system ---"
|
log "--- Step 6/7: Configuring installed system ---"
|
||||||
|
|||||||
@@ -258,6 +258,22 @@ else
|
|||||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
|
||||||
|
# systems CUDA/DCGM can report "system not yet initialized" until fabric
|
||||||
|
# training completes under nvidia-fabricmanager.
|
||||||
|
if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
|
||||||
|
if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager restarted"
|
||||||
|
elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
|
||||||
|
log "nvidia-fabricmanager started"
|
||||||
|
else
|
||||||
|
log "WARN: failed to start nvidia-fabricmanager.service"
|
||||||
|
systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/ fabricmanager: /' || true
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: nvidia-fabricmanager.service not installed"
|
||||||
|
fi
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
|
|||||||
178
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
178
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
@@ -0,0 +1,178 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
|
||||||
|
|
||||||
|
set -u
|
||||||
|
|
||||||
|
log() {
|
||||||
|
echo "[bee-nvidia-recover] $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_blocker() {
|
||||||
|
echo "[bee-nvidia-recover] blocker: $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
cat <<'EOF'
|
||||||
|
usage:
|
||||||
|
bee-nvidia-recover restart-drivers
|
||||||
|
bee-nvidia-recover reset-gpu <index>
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
unit_exists() {
|
||||||
|
systemctl cat "$1" >/dev/null 2>&1
|
||||||
|
}
|
||||||
|
|
||||||
|
unit_is_active() {
|
||||||
|
systemctl is-active --quiet "$1" 2>/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
stop_unit_if_active() {
|
||||||
|
unit="$1"
|
||||||
|
if unit_is_active "$unit"; then
|
||||||
|
log "stopping $unit"
|
||||||
|
systemctl stop "$unit"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
start_unit_if_marked() {
|
||||||
|
unit="$1"
|
||||||
|
marker="$2"
|
||||||
|
if [ "$marker" = "1" ] && unit_exists "$unit"; then
|
||||||
|
log "starting $unit"
|
||||||
|
systemctl start "$unit"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
wait_for_process_exit() {
|
||||||
|
name="$1"
|
||||||
|
tries=0
|
||||||
|
while pgrep -x "$name" >/dev/null 2>&1; do
|
||||||
|
tries=$((tries + 1))
|
||||||
|
if [ "$tries" -ge 15 ]; then
|
||||||
|
log "WARN: $name is still running after stop request"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
kill_pattern() {
|
||||||
|
pattern="$1"
|
||||||
|
if pgrep -f "$pattern" >/dev/null 2>&1; then
|
||||||
|
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
|
||||||
|
[ -n "$line" ] || continue
|
||||||
|
log_blocker "$line"
|
||||||
|
done
|
||||||
|
log "killing processes matching: $pattern"
|
||||||
|
pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
|
||||||
|
sleep 1
|
||||||
|
pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
drain_gpu_clients() {
|
||||||
|
display_was_active=0
|
||||||
|
fabric_was_active=0
|
||||||
|
|
||||||
|
for unit in display-manager.service lightdm.service; do
|
||||||
|
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
|
||||||
|
log_blocker "service $unit"
|
||||||
|
display_was_active=1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
|
||||||
|
log_blocker "service nvidia-fabricmanager.service"
|
||||||
|
fabric_was_active=1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
|
||||||
|
[ -n "$line" ] || continue
|
||||||
|
log_blocker "$line"
|
||||||
|
done
|
||||||
|
log "stopping nv-hostengine"
|
||||||
|
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
for pattern in \
|
||||||
|
"nvidia-smi" \
|
||||||
|
"dcgmi" \
|
||||||
|
"nvvs" \
|
||||||
|
"dcgmproftester" \
|
||||||
|
"all_reduce_perf" \
|
||||||
|
"nvtop" \
|
||||||
|
"bee-gpu-burn" \
|
||||||
|
"bee-john-gpu-stress" \
|
||||||
|
"bee-nccl-gpu-stress" \
|
||||||
|
"Xorg" \
|
||||||
|
"Xwayland"; do
|
||||||
|
kill_pattern "$pattern"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
restore_gpu_clients() {
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
if nvidia-smi -pm 1 >/dev/null 2>&1; then
|
||||||
|
log "enabled NVIDIA persistence mode"
|
||||||
|
else
|
||||||
|
log "WARN: failed to enable NVIDIA persistence mode"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
log "starting nv-hostengine"
|
||||||
|
nv-hostengine
|
||||||
|
fi
|
||||||
|
|
||||||
|
start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
|
||||||
|
start_unit_if_marked display-manager.service "${display_was_active:-0}"
|
||||||
|
if [ "${display_was_active:-0}" = "1" ] && unit_exists lightdm.service && ! unit_is_active lightdm.service; then
|
||||||
|
start_unit_if_marked lightdm.service "1"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
restart_drivers() {
|
||||||
|
drain_gpu_clients
|
||||||
|
for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
|
||||||
|
if lsmod | awk '{print $1}' | grep -qx "$mod"; then
|
||||||
|
log "unloading module $mod"
|
||||||
|
rmmod "$mod"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
|
||||||
|
log "reloading NVIDIA driver stack"
|
||||||
|
/usr/local/bin/bee-nvidia-load
|
||||||
|
restore_gpu_clients
|
||||||
|
}
|
||||||
|
|
||||||
|
reset_gpu() {
|
||||||
|
index="$1"
|
||||||
|
drain_gpu_clients
|
||||||
|
log "resetting GPU $index"
|
||||||
|
nvidia-smi -r -i "$index"
|
||||||
|
restore_gpu_clients
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd="${1:-}"
|
||||||
|
case "$cmd" in
|
||||||
|
restart-drivers)
|
||||||
|
restart_drivers
|
||||||
|
;;
|
||||||
|
reset-gpu)
|
||||||
|
if [ "$#" -ne 2 ]; then
|
||||||
|
usage >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
reset_gpu "$2"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
usage >&2
|
||||||
|
exit 2
|
||||||
|
;;
|
||||||
|
esac
|
||||||
@@ -9,9 +9,9 @@ xset s noblank
|
|||||||
|
|
||||||
# Set desktop background.
|
# Set desktop background.
|
||||||
if [ -f /usr/share/bee/wallpaper.png ]; then
|
if [ -f /usr/share/bee/wallpaper.png ]; then
|
||||||
feh --bg-fill /usr/share/bee/wallpaper.png
|
feh --bg-center --image-bg '#000000' /usr/share/bee/wallpaper.png
|
||||||
else
|
else
|
||||||
xsetroot -solid '#f6c90e'
|
xsetroot -solid '#000000'
|
||||||
fi
|
fi
|
||||||
|
|
||||||
tint2 &
|
tint2 &
|
||||||
|
|||||||
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
100
iso/overlay/usr/local/bin/bee-remount-medium
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
||||||
|
#
|
||||||
|
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
||||||
|
# was lost and /run/live/medium/live/filesystem.squashfs is missing.
|
||||||
|
#
|
||||||
|
# Usage: bee-remount-medium [--wait]
|
||||||
|
# --wait keep retrying every 5 seconds until the medium is found (useful
|
||||||
|
# while physically reconnecting the device)
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
MEDIUM_DIR="/run/live/medium"
|
||||||
|
SQUASHFS_REL="live/filesystem.squashfs"
|
||||||
|
WAIT_MODE=0
|
||||||
|
|
||||||
|
for arg in "$@"; do
|
||||||
|
case "$arg" in
|
||||||
|
--wait|-w) WAIT_MODE=1 ;;
|
||||||
|
--help|-h)
|
||||||
|
echo "Usage: bee-remount-medium [--wait]"
|
||||||
|
echo " Finds and remounts the live ISO medium to $MEDIUM_DIR"
|
||||||
|
echo " --wait retry every 5 s until a medium with squashfs is found"
|
||||||
|
exit 0 ;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||||
|
die() { log "ERROR: $*" >&2; exit 1; }
|
||||||
|
|
||||||
|
# Return all candidate block devices (optical + removable USB mass storage)
|
||||||
|
find_candidates() {
|
||||||
|
# CD/DVD drives
|
||||||
|
for dev in /dev/sr* /dev/scd*; do
|
||||||
|
[ -b "$dev" ] && echo "$dev"
|
||||||
|
done
|
||||||
|
# USB/removable disks and partitions
|
||||||
|
for dev in /dev/sd* /dev/vd*; do
|
||||||
|
[ -b "$dev" ] || continue
|
||||||
|
# Only whole disks or partitions — skip the same device we are running from
|
||||||
|
local removable
|
||||||
|
local base
|
||||||
|
base=$(basename "$dev")
|
||||||
|
removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
|
||||||
|
[ "$removable" = "1" ] && echo "$dev"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# Try to mount $1 to $MEDIUM_DIR and check for squashfs
|
||||||
|
try_mount() {
|
||||||
|
local dev="$1"
|
||||||
|
local tmpdir
|
||||||
|
tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
|
||||||
|
if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
|
||||||
|
if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
|
||||||
|
# Unmount probe mount and mount properly onto live path
|
||||||
|
umount "$tmpdir" 2>/dev/null || true
|
||||||
|
rmdir "$tmpdir" 2>/dev/null || true
|
||||||
|
# Unmount whatever is currently on MEDIUM_DIR (may be empty/stale)
|
||||||
|
umount "$MEDIUM_DIR" 2>/dev/null || true
|
||||||
|
mkdir -p "$MEDIUM_DIR"
|
||||||
|
if mount -o ro "$dev" "$MEDIUM_DIR"; then
|
||||||
|
log "Mounted $dev on $MEDIUM_DIR"
|
||||||
|
return 0
|
||||||
|
else
|
||||||
|
log "Mount of $dev on $MEDIUM_DIR failed"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
umount "$tmpdir" 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
rmdir "$tmpdir" 2>/dev/null || true
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
attempt() {
|
||||||
|
log "Scanning for ISO medium..."
|
||||||
|
for dev in $(find_candidates); do
|
||||||
|
log " Trying $dev ..."
|
||||||
|
if try_mount "$dev"; then
|
||||||
|
local sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
|
||||||
|
log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ "$WAIT_MODE" = "1" ]; then
|
||||||
|
log "Waiting for live medium (press Ctrl+C to abort)..."
|
||||||
|
while true; do
|
||||||
|
if attempt; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
log " Not found — retrying in 5 s (reconnect the disc now)"
|
||||||
|
sleep 5
|
||||||
|
done
|
||||||
|
else
|
||||||
|
attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
|
||||||
|
fi
|
||||||
BIN
iso/overlay/usr/share/bee/wallpaper.png
Normal file
BIN
iso/overlay/usr/share/bee/wallpaper.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 70 KiB |
64
scripts/deploy.sh
Executable file
64
scripts/deploy.sh
Executable file
@@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
REMOTE_USER="bee"
|
||||||
|
REMOTE_BIN="/usr/local/bin/bee"
|
||||||
|
LOCAL_BIN="audit/bee"
|
||||||
|
SERVICES="bee-audit bee-web"
|
||||||
|
|
||||||
|
# --- IP ---
|
||||||
|
if [[ $# -ge 1 ]]; then
|
||||||
|
HOST="$1"
|
||||||
|
else
|
||||||
|
read -rp "IP адрес хоста: " HOST
|
||||||
|
fi
|
||||||
|
[[ -z "$HOST" ]] && { echo "Ошибка: IP не указан"; exit 1; }
|
||||||
|
|
||||||
|
# --- SSH options ---
|
||||||
|
SSH_OPTS=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)
|
||||||
|
|
||||||
|
# Проверяем, нужен ли пароль
|
||||||
|
SSH_PASS=""
|
||||||
|
if ! ssh "${SSH_OPTS[@]}" -o BatchMode=yes "${REMOTE_USER}@${HOST}" true 2>/dev/null; then
|
||||||
|
if command -v sshpass &>/dev/null; then
|
||||||
|
read -rsp "Пароль для ${REMOTE_USER}@${HOST}: " SSH_PASS
|
||||||
|
echo
|
||||||
|
SSH_CMD=(sshpass -p "$SSH_PASS" ssh "${SSH_OPTS[@]}")
|
||||||
|
SCP_CMD=(sshpass -p "$SSH_PASS" scp "${SSH_OPTS[@]}")
|
||||||
|
else
|
||||||
|
echo "sshpass не установлен. Введите пароль вручную при запросе (или установите SSH-ключ)."
|
||||||
|
SSH_CMD=(ssh "${SSH_OPTS[@]}")
|
||||||
|
SCP_CMD=(scp "${SSH_OPTS[@]}")
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
SSH_CMD=(ssh "${SSH_OPTS[@]}")
|
||||||
|
SCP_CMD=(scp "${SSH_OPTS[@]}")
|
||||||
|
fi
|
||||||
|
|
||||||
|
REMOTE="${REMOTE_USER}@${HOST}"
|
||||||
|
|
||||||
|
# --- Build ---
|
||||||
|
echo "==> Сборка бинарника..."
|
||||||
|
(
|
||||||
|
cd audit
|
||||||
|
VERSION=$(sh ./scripts/resolve-version.sh 2>/dev/null || echo "dev")
|
||||||
|
CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
|
||||||
|
go build -ldflags "-X main.Version=${VERSION}" -o bee ./cmd/bee
|
||||||
|
)
|
||||||
|
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
|
||||||
|
|
||||||
|
# --- Deploy ---
|
||||||
|
echo "==> Копирование на ${REMOTE}..."
|
||||||
|
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"
|
||||||
|
|
||||||
|
echo "==> Замена бинарника и перезапуск сервисов..."
|
||||||
|
"${SSH_CMD[@]}" "$REMOTE" bash -s <<EOF
|
||||||
|
set -e
|
||||||
|
sudo mv /tmp/bee-new ${REMOTE_BIN}
|
||||||
|
sudo chmod +x ${REMOTE_BIN}
|
||||||
|
sudo systemctl restart ${SERVICES}
|
||||||
|
sleep 2
|
||||||
|
systemctl status ${SERVICES} --no-pager -l
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo "==> Готово."
|
||||||
Reference in New Issue
Block a user