Seed power ramp with single-card TDP limits

Move NCCL and NVBandwidth into validate mode
Use static overlay wallpaper in ISO build
2026-04-16 11:43:01 +03:00 · 2026-04-16 11:02:30 +03:00 · 2026-04-16 10:54:03 +03:00 · 2026-04-16 10:10:18 +03:00 · 2026-04-16 10:00:03 +03:00 · 2026-04-16 09:58:02 +03:00
44 changed files with 4916 additions and 1421 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 .DS_Store
 dist/
 iso/out/
+build-cache/
--- a/audit/go.mod
+++ b/audit/go.mod
@@ -5,22 +5,18 @@ go 1.25.0
 replace reanimator/chart => ../internal/chart

 require (
-	github.com/go-analyze/charts v0.5.26
+	modernc.org/sqlite v1.48.0
 	reanimator/chart v0.0.0-00010101000000-000000000000
 )

 require (
 	github.com/dustin/go-humanize v1.0.1 // indirect
-	github.com/go-analyze/bulk v0.1.3 // indirect
-	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/ncruces/go-strftime v1.0.0 // indirect
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
-	golang.org/x/image v0.24.0 // indirect
 	golang.org/x/sys v0.42.0 // indirect
-	modernc.org/libc v1.70.0 // indirect
+	modernc.org/libc v1.72.0 // indirect
 	modernc.org/mathutil v1.7.1 // indirect
 	modernc.org/memory v1.11.0 // indirect
-	modernc.org/sqlite v1.48.0 // indirect
 )
--- a/audit/go.sum
+++ b/audit/go.sum
@@ -1,37 +1,51 @@
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
 github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
-github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
-github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
-github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
-github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
-github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
-github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
+github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
+github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
+github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
 github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
-github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
-github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
-golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
-golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
+golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
+golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
 golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
-modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
+golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
+golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
+modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
+modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
+modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
+modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
+modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
+modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
+modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
+modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
+modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
+modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
+modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
+modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
+modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
+modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
 modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
 modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
 modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
 modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
+modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
+modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
+modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
+modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
 modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
 modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
+modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
+modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
+modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
+modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -30,7 +30,9 @@ var (
 	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
 	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
 	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
-	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
+	DefaultBeeBenchBaseDir  = DefaultExportDir + "/bee-bench"
+	DefaultBeeBenchPerfDir  = DefaultBeeBenchBaseDir + "/perf"
+	DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
 )

 type App struct {
@@ -84,6 +86,7 @@ type installer interface {
 	InstallToDisk(ctx context.Context, device string, logFile string) error
 	IsLiveMediaInRAM() bool
 	LiveBootSource() platform.LiveBootSource
+	LiveMediaRAMState() platform.LiveMediaRAMState
 	RunInstallToRAM(ctx context.Context, logFunc func(string)) error
 }

@@ -108,6 +111,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
 	return a.installer.LiveBootSource()
 }

+func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
+	return a.installer.LiveMediaRAMState()
+}
+
 func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 	return a.installer.RunInstallToRAM(ctx, logFunc)
 }
@@ -117,6 +124,7 @@ type satRunner interface {
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
+	RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -138,7 +146,7 @@ type satRunner interface {
 	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
-	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 }

 type runtimeChecker interface {
@@ -562,11 +570,18 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp

 func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
-		baseDir = DefaultBenchmarkBaseDir
+		baseDir = DefaultBeeBenchPerfDir
 	}
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }

+func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultBeeBenchPowerDir
+	}
+	return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
+}
+
 func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -729,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
 	return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
 }

+func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
+}
+
 func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
-	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
+	path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
 	body := "Results: " + path
 	if err != nil && err != context.Canceled {
 		body += "\nERROR: " + err.Error()
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -122,11 +122,13 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 type fakeSAT struct {
 	runNvidiaFn               func(string) (string, error)
 	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaPowerBenchFn     func(string, platform.NvidiaBenchmarkOptions) (string, error)
 	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
 	runNvidiaComputeFn        func(string, int, []int) (string, error)
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
 	runNvidiaPulseFn          func(string, int, []int) (string, error)
 	runNvidiaBandwidthFn      func(string, []int) (string, error)
+	runNCCLFn                 func(string, []int) (string, error)
 	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
 	runMemoryFn               func(string) (string, error)
 	runStorageFn              func(string) (string, error)
@@ -154,6 +156,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
 	return f.runNvidiaFn(baseDir)
 }

+func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
+	if f.runNvidiaPowerBenchFn != nil {
+		return f.runNvidiaPowerBenchFn(baseDir, opts)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
 func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNvidiaTargetedStressFn != nil {
 		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -279,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 	return "", nil
 }

-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNCCLFn != nil {
+		return f.runNCCLFn(baseDir, gpuIndices)
+	}
 	return "", nil
 }

+func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
+	t.Parallel()
+
+	var gotBaseDir string
+	var gotGPUIndices []int
+	a := &App{
+		sat: fakeSAT{
+			runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
+				gotBaseDir = baseDir
+				gotGPUIndices = append([]int(nil), gpuIndices...)
+				return "/tmp/nccl-tests.tar.gz", nil
+			},
+		},
+	}
+
+	path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
+	if err != nil {
+		t.Fatalf("RunNCCLTests error: %v", err)
+	}
+	if path != "/tmp/nccl-tests.tar.gz" {
+		t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
+	}
+	if gotBaseDir != "/tmp/sat" {
+		t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
+	}
+	if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
+		t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
+	}
+}
+
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -22,6 +22,8 @@ var supportBundleServices = []string{
 	"bee-selfheal.service",
 	"bee-selfheal.timer",
 	"bee-sshsetup.service",
+	"nvidia-dcgm.service",
+	"nvidia-fabricmanager.service",
 }

 var supportBundleCommands = []struct {
@@ -48,6 +50,43 @@ else
 fi
 `}},
 	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
+	{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
+if command -v nvidia-smi >/dev/null 2>&1; then
+  nvidia-smi topo -m 2>&1 || true
+else
+  echo "nvidia-smi not found"
+fi
+`}},
+	{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
+if ! command -v systemctl >/dev/null 2>&1; then
+  echo "systemctl not found"
+  exit 0
+fi
+echo "=== unit files ==="
+systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
+echo
+echo "=== active units ==="
+systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
+echo
+echo "=== failed units ==="
+systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
+`}},
+	{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
+# ls exits non-zero when ANY listed path is missing, so count hits in the loop instead.
+found=0
+for candidate in /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager \
+  /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm; do
+  if [ -e "$candidate" ]; then
+    found=1
+    echo "=== $candidate ==="
+    ls -l "$candidate" 2>&1 || true
+    echo
+  fi
+done
+if [ "$found" -eq 0 ]; then
+  echo "no fabric manager binaries found"
+fi
+`}},
 	{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
 if ! command -v lspci >/dev/null 2>&1; then
  echo "lspci not found"
@@ -195,6 +234,10 @@ var supportBundleOptionalFiles = []struct {
 }{
 	{name: "system/kern.log", src: "/var/log/kern.log"},
 	{name: "system/syslog.txt", src: "/var/log/syslog"},
+	{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
+	{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
+	{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
+	{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
 }

 const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -2,25 +2,15 @@ package platform

 import (
 	"fmt"
-	"os"
-	"path/filepath"
-	"regexp"
 	"strings"
 	"time"
 )

 func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
-	return renderBenchmarkReportWithCharts(result, nil)
+	return renderBenchmarkReportWithCharts(result)
 }

-type benchmarkReportChart struct {
-	Title   string
-	Content string
-}
-
-var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
-
-func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
+func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	var b strings.Builder

 	// ── Header ────────────────────────────────────────────────────────────────
@@ -58,7 +48,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 		fmt.Fprintf(&b, "**GPU(s):** %s  \n", strings.Join(parts, ", "))
 	}
 	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
-	fmt.Fprintf(&b, "**App version:** %s  \n", result.BenchmarkVersion)
+	fmt.Fprintf(&b, "**Benchmark version:** %s  \n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
 	if result.RampStep > 0 && result.RampTotal > 0 {
 		fmt.Fprintf(&b, "**Ramp-up step:** %d of %d  \n", result.RampStep, result.RampTotal)
@@ -71,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 	if result.ScalabilityScore > 0 {
 		fmt.Fprintf(&b, "**Scalability score:** %.1f%%  \n", result.ScalabilityScore)
 	}
+	if result.PlatformPowerScore > 0 {
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n", result.PlatformPowerScore)
+	}
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	b.WriteString("\n")

@@ -91,34 +84,139 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 		b.WriteString("\n")
 	}

-	// ── Scorecard table ───────────────────────────────────────────────────────
-	b.WriteString("## Scorecard\n\n")
-	b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
-	b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
+	// ── Balanced Scorecard ────────────────────────────────────────────────────
+	b.WriteString("## Balanced Scorecard\n\n")
+
+	// Perspective 1: Compatibility — hard stops
+	b.WriteString("### 1. Compatibility\n\n")
+	b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
+	b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
 	for _, gpu := range result.GPUs {
-		name := strings.TrimSpace(gpu.Name)
-		if name == "" {
-			name = "Unknown GPU"
+		thermalThrottle := "-"
+		if gpu.Scores.ThermalThrottlePct > 0 {
+			thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
 		}
-		interconnect := "-"
-		if gpu.Scores.InterconnectScore > 0 {
-			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
+		fanAtThrottle := "-"
+		if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
+			fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+		}
+		ecc := "-"
+		if gpu.ECC.Uncorrected > 0 {
+			ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
+		}
+		compatStatus := "✓ OK"
+		if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
+			compatStatus = "⛔ HARD STOP"
+		}
+		fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
+			gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
+	}
+	b.WriteString("\n")
+
+	// Perspective 2: Thermal headroom
+	b.WriteString("### 2. Thermal Headroom\n\n")
+	b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
+	b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
+	for _, gpu := range result.GPUs {
+		shutdownTemp := gpu.ShutdownTempC
+		if shutdownTemp <= 0 {
+			shutdownTemp = 90
+		}
+		slowdownTemp := gpu.SlowdownTempC
+		if slowdownTemp <= 0 {
+			slowdownTemp = 80
+		}
+		headroom := gpu.Scores.TempHeadroomC
+		thermalStatus := "✓ OK"
+		switch {
+		case headroom < 10:
+			thermalStatus = "⛔ CRITICAL"
+		case gpu.Steady.P95TempC >= slowdownTemp:
+			thermalStatus = "⚠ WARNING"
+		}
+		throttlePct := "-"
+		if gpu.Scores.ThermalThrottlePct > 0 {
+			throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+		}
+		fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
+			gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
+	}
+	b.WriteString("\n")
+
+	// Perspective 3: Power delivery
+	b.WriteString("### 3. Power Delivery\n\n")
+	b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
+	b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
+	for _, gpu := range result.GPUs {
+		powerCap := "-"
+		if gpu.Scores.PowerCapThrottlePct > 0 {
+			powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
+		}
+		fanDuty := "-"
+		if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
+			fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+		}
+		powerStatus := "✓ OK"
+		if gpu.Scores.PowerCapThrottlePct > 5 {
+			powerStatus = "⚠ POWER LIMITED"
+		}
+		fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
+			gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
+	}
+	b.WriteString("\n")
+
+	// Perspective 4: Performance
+	b.WriteString("### 4. Performance\n\n")
+	b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
+	b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
+	for _, gpu := range result.GPUs {
+		synthetic := "-"
+		if gpu.Scores.SyntheticScore > 0 {
+			synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
+		}
+		mixed := "-"
+		if gpu.Scores.MixedScore > 0 {
+			mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
+		}
+		mixedEff := "-"
+		if gpu.Scores.MixedEfficiency > 0 {
+			mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
 		}
 		topsPerSM := "-"
 		if gpu.Scores.TOPSPerSMPerGHz > 0 {
 			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
 		}
-		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
-			gpu.Index, name,
-			gpu.Status,
-			gpu.Scores.CompositeScore,
-			gpu.Scores.ComputeScore,
-			topsPerSM,
-			gpu.Scores.PowerSustainScore,
-			gpu.Scores.ThermalSustainScore,
-			gpu.Scores.StabilityScore,
-			interconnect,
-		)
+		fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
+			gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
+	}
+	if len(result.PerformanceRampSteps) > 0 {
+		fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
+	}
+	b.WriteString("\n")
+
+	// Perspective 5: Anomaly flags
+	b.WriteString("### 5. Anomalies\n\n")
+	b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
+	b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
+	for _, gpu := range result.GPUs {
+		eccCorr := "-"
+		if gpu.ECC.Corrected > 0 {
+			eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
+		}
+		syncBoost := "-"
+		if gpu.Scores.SyncBoostThrottlePct > 0 {
+			syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
+		}
+		powerVar := "OK"
+		if gpu.Scores.PowerSustainScore < 70 {
+			powerVar = "⚠ unstable"
+		}
+		thermalVar := "OK"
+		if gpu.Scores.ThermalSustainScore < 70 {
+			thermalVar = "⚠ unstable"
+		}
+		fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
+			gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
 	}
 	b.WriteString("\n")

@@ -147,20 +245,66 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 		if gpu.PowerLimitW > 0 {
 			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
 		}
+		if gpu.PowerLimitDerated {
+			fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
+		}
+		if gpu.CalibratedPeakPowerW > 0 {
+			if gpu.CalibratedPeakTempC > 0 {
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
+			} else {
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
+			}
+		}
 		if gpu.LockedGraphicsClockMHz > 0 {
 			fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
 		}
 		b.WriteString("\n")

 		// Steady-state telemetry
-		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
-		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
-		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
-		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
-		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
-		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
-		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
-		b.WriteString("\n")
+		if benchmarkTelemetryAvailable(gpu.Steady) {
+			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+			b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
+			fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
+			fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
+			fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
+			fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
+			fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+			b.WriteString("\n")
+		} else {
+			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
+		}
+
+		// Per-precision stability phases.
+		if len(gpu.PrecisionSteady) > 0 {
+			b.WriteString("**Per-precision stability:**\n\n")
+			b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
+			for _, p := range gpu.PrecisionSteady {
+				eccCorr := "—"
+				eccUncorr := "—"
+				if !p.ECC.IsZero() {
+					eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
+					eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
+				}
+				status := p.Status
+				if strings.TrimSpace(status) == "" {
+					status = "OK"
+				}
+				fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
+					p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
+					eccCorr, eccUncorr)
+			}
+			b.WriteString("\n")
+		} else {
+			// Legacy: show combined-window variance.
+			fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
+				gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
+		}
+
+		// ECC summary
+		if !gpu.ECC.IsZero() {
+			fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
+				gpu.ECC.Corrected, gpu.ECC.Uncorrected)
+		}

 		// Throttle
 		throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
@@ -171,12 +315,14 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 		// Precision results
 		if len(gpu.PrecisionResults) > 0 {
 			b.WriteString("**Precision results:**\n\n")
-			b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
+			b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
 			for _, p := range gpu.PrecisionResults {
 				if p.Supported {
-					fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
+					weightStr := fmt.Sprintf("×%.3g", p.Weight)
+					fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
+						p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
 				} else {
-					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
+					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
 				}
 			}
 			b.WriteString("\n")
@@ -237,61 +383,54 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 		}
 	}

-	// ── Terminal charts (steady-state only) ───────────────────────────────────
-	if len(charts) > 0 {
-		b.WriteString("## Steady-State Charts\n\n")
-		for _, chart := range charts {
-			content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
-			if content == "" {
-				continue
+	// ── Cooling ───────────────────────────────────────────────────────────────
+	if cooling := result.Cooling; cooling != nil {
+		b.WriteString("## Cooling\n\n")
+		if cooling.Available {
+			b.WriteString("| Metric | Value |\n|--------|-------|\n")
+			fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
+			if cooling.FanDutyCycleAvailable {
+				fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
+				fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
+			} else {
+				b.WriteString("| Average fan duty cycle | N/A |\n")
+				b.WriteString("| P95 fan duty cycle | N/A |\n")
 			}
-			fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
+			b.WriteString("\n")
+		} else {
+			b.WriteString("Cooling telemetry unavailable.\n\n")
+		}
+		for _, note := range cooling.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		if len(cooling.Notes) > 0 {
+			b.WriteString("\n")
 		}
 	}

-	// ── Methodology ───────────────────────────────────────────────────────────
-	b.WriteString("## Methodology\n\n")
-	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
-	b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
-	b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
-	b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
+	// ── Platform Scalability ──────────────────────────────────────────────────
+	if len(result.PerformanceRampSteps) > 0 {
+		b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n\n", result.PlatformPowerScore)
+		b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
+		b.WriteString("|--------|-------------|----------------------|-------------|\n")
+		for _, step := range result.PerformanceRampSteps {
+			fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
+				step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
+		}
+		b.WriteString("\n")
+	}

 	// ── Raw files ─────────────────────────────────────────────────────────────
 	b.WriteString("## Raw Files\n\n")
 	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
-	b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
-	b.WriteString("- `gpu-*-warmup.log`\n")
-	b.WriteString("- `gpu-*-steady.log`\n")
-	b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
-	b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
+	b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
 	if result.Interconnect != nil {
 		b.WriteString("- `nccl-all-reduce.log`\n")
 	}
 	return b.String()
 }

-// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
-// cooldown charts are not useful for human review).
-func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
-	var charts []benchmarkReportChart
-	for _, idx := range gpuIndices {
-		path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
-		raw, err := os.ReadFile(path)
-		if err != nil || len(raw) == 0 {
-			continue
-		}
-		charts = append(charts, benchmarkReportChart{
-			Title:   fmt.Sprintf("GPU %d — Steady State", idx),
-			Content: string(raw),
-		})
-	}
-	return charts
-}
-
-func stripANSIEscapeSequences(raw string) string {
-	return ansiEscapePattern.ReplaceAllString(raw, "")
-}
-
 // formatThrottleLine renders throttle counters as human-readable percentages of
 // the steady-state window.  Only non-zero counters are shown.  When the steady
 // duration is unknown (0), raw seconds are shown instead.
@@ -331,6 +470,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
 func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
+	fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
 	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
 		{
 			name:    "default",
 			profile: "",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
 		},
 		{
 			name:    "stability",
 			profile: "stability",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
 		},
 		{
 			name:    "overnight",
 			profile: "overnight",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
 		},
 	}

@@ -41,6 +41,129 @@ func TestResolveBenchmarkProfile(t *testing.T) {
 	}
 }

+func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
+	t.Parallel()
+
+	labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
+		benchmarkPrecisionPhases,
+		func(label string) string { return label },
+	)
+	if len(labels) != 5 || len(phases) != 5 {
+		t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
+	}
+	if basePhaseSec != 60 {
+		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
+	}
+	if mixedPhaseSec != 300 {
+		t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
+	}
+	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
+		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
+	}
+	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+
+func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
+	t.Parallel()
+
+	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
+		benchmarkPrecisionPhases,
+		func(label string) string { return label },
+	)
+	if basePhaseSec != 300 {
+		t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
+	}
+	if mixedPhaseSec != 3600 {
+		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
+	}
+	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+
+func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
+	t.Parallel()
+
+	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
+		benchmarkPrecisionPhases,
+		func(label string) string { return label },
+	)
+	if basePhaseSec != 3600 {
+		t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
+	}
+	if mixedPhaseSec != 14400 {
+		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
+	}
+	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+
+func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
+	t.Parallel()
+
+	phases := []benchmarkPlannedPhase{
+		{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
+		{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
+		{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
+	}
+	rows := []GPUMetricRow{
+		{ElapsedSec: 5},
+		{ElapsedSec: 15},
+		{ElapsedSec: 25},
+		{ElapsedSec: 65},
+	}
+	got := splitBenchmarkRowsByPlannedPhase(rows, phases)
+	if len(got["fp8"]) != 1 {
+		t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
+	}
+	if len(got["fp16"]) != 1 {
+		t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
+	}
+	if len(got["mixed"]) != 2 {
+		t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
+	}
+}
+
+func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
+	t.Parallel()
+
+	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
+		t.Fatalf("supported=%v", got)
+	}
+	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
+		t.Fatalf("supported=%v", got)
+	}
+}
+
+func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name       string
+		raw        string
+		wantStatus string
+	}{
+		{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
+		{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
+		{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
+	}
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
+			if got != tc.wantStatus {
+				t.Fatalf("status=%q want %q", got, tc.wantStatus)
+			}
+		})
+	}
+}
+
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()

@@ -65,8 +188,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
 		"[gpu 0] compute_capability=9.0",
 		"[gpu 0] backend=cublasLt",
 		"[gpu 0] duration_s=10",
+		"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
 		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
 		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
+		"[gpu 0] int8_tensor_iterations=80",
 		"[gpu 0] fp16_tensor_iterations=200",
 		"[gpu 0] fp8_e4m3_iterations=50",
 		"[gpu 0] status=OK",
@@ -79,15 +204,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
 	if got.ComputeCapability != "9.0" {
 		t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
 	}
-	if len(got.Profiles) != 2 {
-		t.Fatalf("profiles=%d want 2", len(got.Profiles))
+	if len(got.Profiles) != 3 {
+		t.Fatalf("profiles=%d want 3", len(got.Profiles))
 	}
 	if got.Profiles[0].TeraOpsPerSec <= 0 {
 		t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
 	}
+	if got.Profiles[0].Category != "fp16_bf16" {
+		t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
+	}
 	if got.Profiles[1].Category != "fp8" {
 		t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
 	}
+	if got.Profiles[2].Category != "int8" {
+		t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
+	}
+	if got.Profiles[2].Weight != 0.25 {
+		t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
+	}
 }

 func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
@@ -131,6 +265,13 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 				DegradationReasons: []string{"power_capped"},
 			},
 		},
+		Cooling: &BenchmarkCoolingSummary{
+			Available:             true,
+			AvgFanRPM:             9200,
+			FanDutyCycleAvailable: true,
+			AvgFanDutyCyclePct:    47.5,
+			P95FanDutyCyclePct:    62.0,
+		},
 	}

 	report := renderBenchmarkReport(result)
@@ -140,6 +281,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 		"1176.00",
 		"fp16_tensor",
 		"700.00",
+		"Cooling",
+		"Average fan duty cycle",
+		"47.5%",
 	} {
 		if !strings.Contains(report, needle) {
 			t.Fatalf("report missing %q\n%s", needle, report)
@@ -147,35 +291,50 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
 	}
 }

-func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
+func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
 	t.Parallel()

-	report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
+	report := renderBenchmarkReport(NvidiaBenchmarkResult{
 		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
 		OverallStatus:      "OK",
 		SelectedGPUIndices: []int{0},
 		Normalization: BenchmarkNormalization{
 			Status: "full",
 		},
-	}, []benchmarkReportChart{
-		{
-			Title:   "GPU 0 Steady State",
-			Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
-		},
 	})

 	for _, needle := range []string{
-		"Steady-State Charts",
-		"GPU 0 Steady State",
-		"GPU 0 chart",
-		"42┤───",
+		"gpu-metrics.csv",
+		"gpu-metrics.html",
+		"gpu-burn.log",
 	} {
 		if !strings.Contains(report, needle) {
 			t.Fatalf("report missing %q\n%s", needle, report)
 		}
 	}
-	if strings.Contains(report, "\x1b[31m") {
-		t.Fatalf("report should not contain ANSI escapes\n%s", report)
+}
+
+func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
+	t.Parallel()
+
+	score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
+		PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
+			{Precision: "fp16", WeightedTeraOpsPerSec: 100},
+			{Precision: "fp64", WeightedTeraOpsPerSec: 999},
+			{Precision: "fp4", WeightedTeraOpsPerSec: 999},
+		},
+		PrecisionResults: []BenchmarkPrecisionResult{
+			{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
+			{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
+			{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
+		},
+	})
+
+	if score.SyntheticScore != 100 {
+		t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
+	}
+	if score.MixedScore != 50 {
+		t.Fatalf("MixedScore=%f want 50", score.MixedScore)
 	}
 }

--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -25,6 +25,18 @@ type BenchmarkCPULoad struct {
 	Note   string `json:"note,omitempty"`
 }

+// BenchmarkCoolingSummary captures fan telemetry averaged across the full
+// benchmark run.
+type BenchmarkCoolingSummary struct {
+	Available             bool     `json:"available"`
+	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
+	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
+	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
+	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
+	Notes                 []string `json:"notes,omitempty"`
+}
+
 const (
 	NvidiaBenchmarkProfileStandard  = "standard"
 	NvidiaBenchmarkProfileStability = "stability"
@@ -43,28 +55,33 @@ type NvidiaBenchmarkOptions struct {
 	RampRunID         string // shared identifier across all steps of the same ramp-up run
 }

-
 type NvidiaBenchmarkResult struct {
-	BenchmarkVersion   string                       `json:"benchmark_version"`
-	GeneratedAt        time.Time                    `json:"generated_at"`
-	Hostname           string                       `json:"hostname,omitempty"`
-	ServerModel        string                       `json:"server_model,omitempty"`
-	BenchmarkProfile   string                       `json:"benchmark_profile"`
-	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
-	RampStep           int                          `json:"ramp_step,omitempty"`
-	RampTotal          int                          `json:"ramp_total,omitempty"`
-	RampRunID          string                       `json:"ramp_run_id,omitempty"`
-	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
-	OverallStatus      string                       `json:"overall_status"`
-	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
-	Findings           []string                     `json:"findings,omitempty"`
-	Warnings           []string                     `json:"warnings,omitempty"`
-	Normalization      BenchmarkNormalization       `json:"normalization"`
-	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
-	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
-	GPUs               []BenchmarkGPUResult         `json:"gpus"`
-	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
-	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
+	BenchmarkVersion string    `json:"benchmark_version"`
+	GeneratedAt      time.Time `json:"generated_at"`
+	Hostname         string    `json:"hostname,omitempty"`
+	ServerModel      string    `json:"server_model,omitempty"`
+	BenchmarkProfile string    `json:"benchmark_profile"`
+	ParallelGPUs     bool      `json:"parallel_gpus,omitempty"`
+	RampStep         int       `json:"ramp_step,omitempty"`
+	RampTotal        int       `json:"ramp_total,omitempty"`
+	RampRunID        string    `json:"ramp_run_id,omitempty"`
+	ScalabilityScore float64   `json:"scalability_score,omitempty"`
+	// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
+	// 100% = each added GPU contributes exactly its single-card throughput.
+	// < 100% = throughput loss due to thermal throttle, power limits, or contention.
+	PlatformPowerScore   float64                      `json:"platform_power_score,omitempty"`
+	PerformanceRampSteps []NvidiaPerformanceRampStep  `json:"performance_ramp_steps,omitempty"`
+	OverallStatus        string                       `json:"overall_status"`
+	SelectedGPUIndices   []int                        `json:"selected_gpu_indices"`
+	Findings             []string                     `json:"findings,omitempty"`
+	Warnings             []string                     `json:"warnings,omitempty"`
+	Normalization        BenchmarkNormalization       `json:"normalization"`
+	HostConfig           *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	CPULoad              *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
+	Cooling              *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
+	GPUs                 []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
 }

 type BenchmarkNormalization struct {
@@ -84,35 +101,51 @@ type BenchmarkNormalizationGPU struct {
 }

 type BenchmarkGPUResult struct {
-	Index                  int                        `json:"index"`
-	UUID                   string                     `json:"uuid,omitempty"`
-	Name                   string                     `json:"name,omitempty"`
-	BusID                  string                     `json:"bus_id,omitempty"`
-	VBIOS                  string                     `json:"vbios,omitempty"`
-	ComputeCapability      string                     `json:"compute_capability,omitempty"`
-	Backend                string                     `json:"backend,omitempty"`
-	Status                 string                     `json:"status"`
-	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
-	MultiprocessorCount    int                        `json:"multiprocessor_count,omitempty"`
-	DefaultPowerLimitW     float64                    `json:"default_power_limit_w,omitempty"`
+	Index               int     `json:"index"`
+	UUID                string  `json:"uuid,omitempty"`
+	Name                string  `json:"name,omitempty"`
+	BusID               string  `json:"bus_id,omitempty"`
+	VBIOS               string  `json:"vbios,omitempty"`
+	ComputeCapability   string  `json:"compute_capability,omitempty"`
+	Backend             string  `json:"backend,omitempty"`
+	Status              string  `json:"status"`
+	PowerLimitW         float64 `json:"power_limit_w,omitempty"`
+	PowerLimitDerated   bool    `json:"power_limit_derated,omitempty"`
+	MultiprocessorCount int     `json:"multiprocessor_count,omitempty"`
+	DefaultPowerLimitW  float64 `json:"default_power_limit_w,omitempty"`
+	// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
+	// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
+	ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
+	// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
+	// Fallback: 80°C.
+	SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
 	// CalibratedPeakPowerW is the p95 power measured during a short
 	// dcgmi targeted_power calibration run before the main benchmark.
 	// Used as the reference denominator for PowerSustainScore instead of
 	// the hardware default limit, which bee-gpu-burn cannot reach.
-	CalibratedPeakPowerW   float64                    `json:"calibrated_peak_power_w,omitempty"`
-	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
-	BaseGraphicsClockMHz   float64                    `json:"base_graphics_clock_mhz,omitempty"`
-	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
-	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
-	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
-	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
-	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
-	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
-	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
-	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
-	Scores                 BenchmarkScorecard         `json:"scores"`
-	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
-	Notes                  []string                   `json:"notes,omitempty"`
+	CalibratedPeakPowerW   float64                         `json:"calibrated_peak_power_w,omitempty"`
+	CalibratedPeakTempC    float64                         `json:"calibrated_peak_temp_c,omitempty"`
+	PowerCalibrationTries  int                             `json:"power_calibration_tries,omitempty"`
+	MaxGraphicsClockMHz    float64                         `json:"max_graphics_clock_mhz,omitempty"`
+	BaseGraphicsClockMHz   float64                         `json:"base_graphics_clock_mhz,omitempty"`
+	MaxMemoryClockMHz      float64                         `json:"max_memory_clock_mhz,omitempty"`
+	LockedGraphicsClockMHz float64                         `json:"locked_graphics_clock_mhz,omitempty"`
+	LockedMemoryClockMHz   float64                         `json:"locked_memory_clock_mhz,omitempty"`
+	Baseline               BenchmarkTelemetrySummary       `json:"baseline"`
+	Steady                 BenchmarkTelemetrySummary       `json:"steady"`
+	PrecisionSteady        []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
+	PrecisionFailures      []string                        `json:"precision_failures,omitempty"`
+	Cooldown               BenchmarkTelemetrySummary       `json:"cooldown"`
+	Throttle               BenchmarkThrottleCounters       `json:"throttle_counters"`
+	// ECC error delta accumulated over the full benchmark (all phases combined).
+	ECC                BenchmarkECCCounters       `json:"ecc,omitempty"`
+	PrecisionResults   []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
+	Scores             BenchmarkScorecard         `json:"scores"`
+	DegradationReasons []string                   `json:"degradation_reasons,omitempty"`
+	Notes              []string                   `json:"notes,omitempty"`
+	// CoolingWarning is non-empty when a thermal throttle event occurred with
+	// a clock drop ≥20% while server fans were not at 100% duty cycle.
+	CoolingWarning string `json:"cooling_warning,omitempty"`
 }

 type BenchmarkTelemetrySummary struct {
@@ -142,6 +175,18 @@ type BenchmarkThrottleCounters struct {
 	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
 }

+// BenchmarkECCCounters holds ECC error counts sampled at a point in time.
+// Corrected = single-bit errors fixed by ECC (DRAM degradation).
+// Uncorrected = double-bit errors that could not be corrected (serious fault).
+// Both are volatile (since last driver reset), not persistent.
+type BenchmarkECCCounters struct {
+	Corrected   uint64 `json:"corrected"`
+	Uncorrected uint64 `json:"uncorrected"`
+}
+
+func (e BenchmarkECCCounters) Total() uint64 { return e.Corrected + e.Uncorrected }
+func (e BenchmarkECCCounters) IsZero() bool  { return e.Corrected == 0 && e.Uncorrected == 0 }
+
 type BenchmarkPrecisionResult struct {
 	Name          string  `json:"name"`
 	Category      string  `json:"category"`
@@ -152,19 +197,52 @@ type BenchmarkPrecisionResult struct {
 	K             uint64  `json:"k,omitempty"`
 	Iterations    uint64  `json:"iterations,omitempty"`
 	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
-	Notes         string  `json:"notes,omitempty"`
+	// Weight is the fp32-equivalence factor for this precision category.
+	// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
+	// WeightedTeraOpsPerSec = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
+	Weight                float64 `json:"weight,omitempty"`
+	WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
+	Notes                 string  `json:"notes,omitempty"`
 }

 type BenchmarkScorecard struct {
-	ComputeScore        float64 `json:"compute_score"`
+	ComputeScore float64 `json:"compute_score"`
+	// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
+	// steady phases (each precision ran alone, full GPU dedicated).
+	SyntheticScore float64 `json:"synthetic_score,omitempty"`
+	// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
+	// (all precisions competing simultaneously — closer to real workloads).
+	MixedScore float64 `json:"mixed_score,omitempty"`
+	// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
+	// sustains throughput under concurrent mixed-precision load.
+	MixedEfficiency     float64 `json:"mixed_efficiency,omitempty"`
 	PowerSustainScore   float64 `json:"power_sustain_score"`
 	ThermalSustainScore float64 `json:"thermal_sustain_score"`
-	StabilityScore      float64 `json:"stability_score"`
-	InterconnectScore   float64 `json:"interconnect_score"`
-	CompositeScore      float64 `json:"composite_score"`
+	// StabilityScore reflects the share of steady-state time the GPU spent NOT
+	// throttling (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
+	StabilityScore float64 `json:"stability_score"`
+
+	// Throttle breakdown — percentage of steady-state time in each throttle type.
+	// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
+	ThermalThrottlePct   float64 `json:"thermal_throttle_pct"`   // HW+SW thermal slowdown
+	PowerCapThrottlePct  float64 `json:"power_cap_throttle_pct"` // SW power cap
+	SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
+
+	// Temperature headroom: distance to the 100°C destruction threshold.
+	// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
+	// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
+	TempHeadroomC float64 `json:"temp_headroom_c"`
+
+	InterconnectScore float64 `json:"interconnect_score"`
+	// ServerQualityScore (0–100) reflects server infrastructure quality independent
+	// of GPU model. Combines throttle time, power variance, and temp variance.
+	// Use this to compare servers with the same GPU, or to flag a bad server
+	// that throttles an otherwise fast GPU.
+	ServerQualityScore float64 `json:"server_quality_score"`
+	// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
+	// A throttling GPU will score lower here automatically — no quality multiplier.
+	CompositeScore float64 `json:"composite_score"`
 	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
-	// Comparable across throttle levels and GPU generations. Low value at normal
-	// clocks indicates silicon degradation.
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }

@@ -182,6 +260,22 @@ type BenchmarkServerPower struct {
 	Notes           []string `json:"notes,omitempty"`
 }

+// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
+// during a dedicated single-precision steady window.  Because only one kernel
+// type runs at a time, the PowerCVPct here is a genuine stability signal.
+type BenchmarkPrecisionSteadyPhase struct {
+	Precision             string                    `json:"precision"` // e.g. "fp8", "fp16", "fp32"
+	Status                string                    `json:"status,omitempty"`
+	Steady                BenchmarkTelemetrySummary `json:"steady"`
+	TeraOpsPerSec         float64                   `json:"teraops_per_sec,omitempty"`
+	WeightedTeraOpsPerSec float64                   `json:"weighted_teraops_per_sec,omitempty"`
+	// ECC errors accumulated during this precision phase only.
+	// Non-zero corrected = stress-induced DRAM errors for this kernel type.
+	// Any uncorrected = serious fault triggered by this precision workload.
+	ECC   BenchmarkECCCounters `json:"ecc,omitempty"`
+	Notes string               `json:"notes,omitempty"`
+}
+
 type BenchmarkInterconnectResult struct {
 	Status             string   `json:"status"`
 	Attempted          bool     `json:"attempted"`
@@ -193,3 +287,78 @@ type BenchmarkInterconnectResult struct {
 	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
 	Notes              []string `json:"notes,omitempty"`
 }
+
+type NvidiaPowerBenchResult struct {
+	BenchmarkVersion     string                 `json:"benchmark_version"`
+	GeneratedAt          time.Time              `json:"generated_at"`
+	Hostname             string                 `json:"hostname,omitempty"`
+	ServerModel          string                 `json:"server_model,omitempty"`
+	BenchmarkProfile     string                 `json:"benchmark_profile"`
+	SelectedGPUIndices   []int                  `json:"selected_gpu_indices"`
+	RecommendedSlotOrder []int                  `json:"recommended_slot_order,omitempty"`
+	RampSteps            []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
+	OverallStatus        string                 `json:"overall_status"`
+	// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
+	// cumulative thermal ramp. Represents the actual sustained power budget of
+	// this server under full GPU load. Use for rack power planning.
+	PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
+	// ServerPower captures IPMI server power delta (idle→loaded) measured in
+	// parallel with the thermal ramp. Use to compare GPU-reported TDP against
+	// actual wall-power draw as seen by the server's power supply.
+	ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
+	Findings    []string              `json:"findings,omitempty"`
+	GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
+}
+
+type NvidiaPowerBenchGPU struct {
+	Index              int     `json:"index"`
+	Name               string  `json:"name,omitempty"`
+	BusID              string  `json:"bus_id,omitempty"`
+	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
+	// AppliedPowerLimitW is the stable limit found during single-card calibration.
+	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
+	// StablePowerLimitW is the final fixed limit for this GPU after the
+	// cumulative thermal ramp. This is the limit at which the GPU operated
+	// stably with all other GPUs running simultaneously at their own limits.
+	// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
+	// additional derating.
+	StablePowerLimitW   float64  `json:"stable_power_limit_w,omitempty"`
+	MaxObservedPowerW   float64  `json:"max_observed_power_w,omitempty"`
+	MaxObservedTempC    float64  `json:"max_observed_temp_c,omitempty"`
+	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
+	Derated             bool     `json:"derated,omitempty"`
+	Status              string   `json:"status"`
+	Notes               []string `json:"notes,omitempty"`
+	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
+	CoolingWarning string `json:"cooling_warning,omitempty"`
+}
+
+type NvidiaPowerBenchStep struct {
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// NewGPUIndex is the GPU whose stable limit was searched in this step.
+	NewGPUIndex int `json:"new_gpu_index"`
+	// NewGPUStableLimitW is the stable power limit found for the new GPU.
+	NewGPUStableLimitW  float64  `json:"new_gpu_stable_limit_w,omitempty"`
+	TotalObservedPowerW float64  `json:"total_observed_power_w,omitempty"`
+	AvgObservedPowerW   float64  `json:"avg_observed_power_w,omitempty"`
+	Derated             bool     `json:"derated,omitempty"`
+	Status              string   `json:"status"`
+	Notes               []string `json:"notes,omitempty"`
+}
+
+// NvidiaPerformanceRampStep holds per-step performance data for the
+// scalability ramp-up phase of the performance benchmark.
+type NvidiaPerformanceRampStep struct {
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
+	// TOPS from dedicated single-precision phases) across all GPUs in this step.
+	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
+	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
+	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
+	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
+	ScalabilityPct float64  `json:"scalability_pct"`
+	Status         string   `json:"status"`
+	Notes          []string `json:"notes,omitempty"`
+}
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -13,14 +13,21 @@ import (

 // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
 type GPUMetricRow struct {
-	ElapsedSec  float64 `json:"elapsed_sec"`
-	GPUIndex    int     `json:"index"`
-	TempC       float64 `json:"temp_c"`
-	UsagePct    float64 `json:"usage_pct"`
-	MemUsagePct float64 `json:"mem_usage_pct"`
-	PowerW      float64 `json:"power_w"`
-	ClockMHz    float64 `json:"clock_mhz"`
-	MemClockMHz float64 `json:"mem_clock_mhz"`
+	Stage                 string  `json:"stage,omitempty"`
+	StageStartSec         float64 `json:"stage_start_sec,omitempty"`
+	StageEndSec           float64 `json:"stage_end_sec,omitempty"`
+	ElapsedSec            float64 `json:"elapsed_sec"`
+	GPUIndex              int     `json:"index"`
+	TempC                 float64 `json:"temp_c"`
+	UsagePct              float64 `json:"usage_pct"`
+	MemUsagePct           float64 `json:"mem_usage_pct"`
+	PowerW                float64 `json:"power_w"`
+	ClockMHz              float64 `json:"clock_mhz"`
+	MemClockMHz           float64 `json:"mem_clock_mhz"`
+	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
+	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
+	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -141,14 +148,28 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
+	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
 	for _, r := range rows {
-		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
-			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
+		dutyAvail := 0
+		if r.FanDutyCycleAvailable {
+			dutyAvail = 1
+		}
+		dutyEstimated := 0
+		if r.FanDutyCycleEstimated {
+			dutyEstimated = 1
+		}
+		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n", // stage is CSV-quoted: an embedded " is doubled, not backslash-escaped
+			`"`+strings.ReplaceAll(strings.TrimSpace(r.Stage), `"`, `""`)+`"`, r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }

+type gpuMetricStageSpan struct {
+	Name  string
+	Start float64
+	End   float64
+}
+
 // WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
 func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
 	// Group by GPU index preserving order.
@@ -163,9 +184,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
 		gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
 	}

+	stageSpans := buildGPUMetricStageSpans(rows)
+	stageColorByName := make(map[string]string, len(stageSpans))
+	for i, span := range stageSpans {
+		stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
+	}
+
+	var legend strings.Builder
+	if len(stageSpans) > 0 {
+		legend.WriteString(`<div class="stage-legend">`)
+		for _, span := range stageSpans {
+			fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
+				stageColorByName[span.Name], gpuHTMLEscape(span.Name))
+		}
+		legend.WriteString(`</div>`)
+	}
+
 	var svgs strings.Builder
 	for _, gpuIdx := range order {
-		svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
+		svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
 		svgs.WriteString("\n")
 	}

@@ -175,21 +212,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
 <meta charset="utf-8">
 <title>GPU Stress Test Metrics</title>
 <style>
-body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
-h1 { text-align: center; color: #333; margin: 0 0 8px; }
-p  { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
+:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
+*{box-sizing:border-box}
+body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
+.page{padding:24px}
+.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
+.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
+.card-body{padding:16px}
+h1{font-size:22px;margin:0 0 6px}
+p{color:var(--muted);font-size:13px;margin:0 0 16px}
+.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
+.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
+.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
+.chart-block{margin-top:16px}
 </style>
 </head><body>
+<div class="page">
+<div class="card">
+<div class="card-head">GPU Stress Test Metrics</div>
+<div class="card-body">
 <h1>GPU Stress Test Metrics</h1>
 <p>Generated %s</p>
 %s
-</body></html>`, ts, svgs.String())
+<div class="chart-block">%s</div>
+</div>
+</div>
+</div>
+</body></html>`, ts, legend.String(), svgs.String())

 	return os.WriteFile(path, []byte(html), 0644)
 }

 // drawGPUChartSVG generates a self-contained SVG chart for one GPU.
-func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
+func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
 	// Layout
 	const W, H = 960, 520
 	const plotX1 = 120 // usage axis / chart left border
@@ -284,6 +339,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	}
 	b.WriteString("</g>\n")

+	// Stage backgrounds
+	for _, span := range stageSpans {
+		x1 := xv(span.Start)
+		x2 := xv(span.End)
+		if x2 < x1 {
+			x1, x2 = x2, x1
+		}
+		if x2-x1 < 1 {
+			x2 = x1 + 1
+		}
+		color := stageColorByName[span.Name]
+		fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
+			x1, plotY1, x2-x1, PH, color)
+		fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
+			x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
+	}
+
 	// Chart border
 	fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
 		` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
@@ -382,221 +454,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	return b.String()
 }

-const (
-	ansiAmber  = "\033[38;5;214m"
-	ansiReset  = "\033[0m"
-)
-
-const (
-	termChartWidth  = 70
-	termChartHeight = 12
-)
-
-// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
-// Used in SAT stress-test logs.
-func RenderGPUTerminalChart(rows []GPUMetricRow) string {
-	seen := make(map[int]bool)
-	var order []int
-	gpuMap := make(map[int][]GPUMetricRow)
-	for _, r := range rows {
-		if !seen[r.GPUIndex] {
-			seen[r.GPUIndex] = true
-			order = append(order, r.GPUIndex)
-		}
-		gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
-	}
-
-	type seriesDef struct {
-		caption string
-		color   string
-		fn      func(GPUMetricRow) float64
-	}
-	defs := []seriesDef{
-		{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
-		{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
-		{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
-		{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
-	}
-
-	var b strings.Builder
-	for _, gpuIdx := range order {
-		gr := gpuMap[gpuIdx]
-		if len(gr) == 0 {
-			continue
-		}
-		tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
-		fmt.Fprintf(&b, "GPU %d — Stress Test Metrics  (%.0f seconds)\n\n", gpuIdx, tMax)
-		for _, d := range defs {
-			b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
-				termChartHeight, termChartWidth))
-			b.WriteRune('\n')
-		}
-	}
-
-	return strings.TrimRight(b.String(), "\n")
-}
-
-// renderLineChart draws a single time-series line chart using box-drawing characters.
-// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
-func renderLineChart(vals []float64, color, caption string, height, width int) string {
-	if len(vals) == 0 {
-		return caption + "\n"
-	}
-
-	mn, mx := gpuMinMax(vals)
-	if mn == mx {
-		mx = mn + 1
-	}
-
-	// Use the smaller of width or len(vals) to avoid stretching sparse data.
-	w := width
-	if len(vals) < w {
-		w = len(vals)
-	}
-	data := gpuDownsample(vals, w)
-
-	// row[i] = display row index: 0 = top = max value, height = bottom = min value.
-	row := make([]int, w)
-	for i, v := range data {
-		r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
-		if r < 0 {
-			r = 0
-		}
-		if r > height {
-			r = height
-		}
-		row[i] = r
-	}
-
-	// Fill the character grid.
-	grid := make([][]rune, height+1)
-	for i := range grid {
-		grid[i] = make([]rune, w)
-		for j := range grid[i] {
-			grid[i][j] = ' '
-		}
-	}
-	for x := 0; x < w; x++ {
-		r := row[x]
-		if x == 0 {
-			grid[r][0] = '─'
-			continue
-		}
-		p := row[x-1]
-		switch {
-		case r == p:
-			grid[r][x] = '─'
-		case r < p: // value went up (row index decreased toward top)
-			grid[r][x] = '╭'
-			grid[p][x] = '╯'
-			for y := r + 1; y < p; y++ {
-				grid[y][x] = '│'
-			}
-		default: // r > p, value went down
-			grid[p][x] = '╮'
-			grid[r][x] = '╰'
-			for y := p + 1; y < r; y++ {
-				grid[y][x] = '│'
-			}
-		}
-	}
-
-	// Y axis tick labels.
-	ticks := gpuNiceTicks(mn, mx, height/2)
-	tickAtRow := make(map[int]string)
-	labelWidth := 4
-	for _, t := range ticks {
-		r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
-		if r < 0 || r > height {
-			continue
-		}
-		s := gpuFormatTick(t)
-		tickAtRow[r] = s
-		if len(s) > labelWidth {
-			labelWidth = len(s)
-		}
-	}
-
-	var b strings.Builder
-	for r := 0; r <= height; r++ {
-		label := tickAtRow[r]
-		fmt.Fprintf(&b, "%*s", labelWidth, label)
-		switch {
-		case label != "":
-			b.WriteRune('┤')
-		case r == height:
-			b.WriteRune('┼')
-		default:
-			b.WriteRune('│')
-		}
-		b.WriteString(color)
-		b.WriteString(string(grid[r]))
-		b.WriteString(ansiReset)
-		b.WriteRune('\n')
-	}
-
-	// Bottom axis.
-	b.WriteString(strings.Repeat(" ", labelWidth))
-	b.WriteRune('└')
-	b.WriteString(strings.Repeat("─", w))
-	b.WriteRune('\n')
-
-	// Caption centered under the chart.
-	if caption != "" {
-		total := labelWidth + 1 + w
-		if pad := (total - len(caption)) / 2; pad > 0 {
-			b.WriteString(strings.Repeat(" ", pad))
-		}
-		b.WriteString(caption)
-		b.WriteRune('\n')
-	}
-
-	return b.String()
-}
-
-func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
-	v := make([]float64, len(rows))
-	for i, r := range rows {
-		v[i] = fn(r)
-	}
-	return v
-}
-
-// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
-func gpuDownsample(vals []float64, w int) []float64 {
-	n := len(vals)
-	if n == 0 {
-		return make([]float64, w)
-	}
-	result := make([]float64, w)
-	if n >= w {
-		counts := make([]int, w)
-		for i, v := range vals {
-			bucket := i * w / n
-			if bucket >= w {
-				bucket = w - 1
-			}
-			result[bucket] += v
-			counts[bucket]++
-		}
-		for i := range result {
-			if counts[i] > 0 {
-				result[i] /= float64(counts[i])
-			}
-		}
-	} else {
-		// Nearest-neighbour upsample.
-		for i := range result {
-			src := i * (n - 1) / (w - 1)
-			if src >= n {
-				src = n - 1
-			}
-			result[i] = vals[src]
-		}
-	}
-	return result
-}
-
 func gpuMinMax(vals []float64) (float64, float64) {
 	if len(vals) == 0 {
 		return 0, 1
@@ -641,3 +498,57 @@ func gpuFormatTick(v float64) string {
 	}
 	return strconv.FormatFloat(v, 'f', 1, 64)
 }
+
+var gpuMetricStagePalette = []string{
+	"#d95c5c",
+	"#2185d0",
+	"#21ba45",
+	"#f2c037",
+	"#6435c9",
+	"#00b5ad",
+	"#a5673f",
+}
+
+// buildGPUMetricStageSpans merges consecutive rows with the same stage name into spans.
+func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
+	var out []gpuMetricStageSpan
+	for _, r := range rows {
+		stage := strings.TrimSpace(r.Stage)
+		if stage == "" {
+			stage = "run"
+		}
+		from, to := r.StageStartSec, r.StageEndSec
+		if to <= from {
+			from, to = r.ElapsedSec, r.ElapsedSec
+		}
+		if n := len(out); n > 0 && out[n-1].Name == stage {
+			last := &out[n-1]
+			if from < last.Start {
+				last.Start = from
+			}
+			if to > last.End {
+				last.End = to
+			}
+			continue
+		}
+		out = append(out, gpuMetricStageSpan{Name: stage, Start: from, End: to})
+	}
+	for i := range out {
+		if out[i].End <= out[i].Start {
+			out[i].End = out[i].Start + 1 // never emit an empty or inverted span
+		}
+	}
+	return out
+}
+
+var gpuHTMLReplacer = strings.NewReplacer(
+	"&", "&amp;",
+	"<", "&lt;",
+	">", "&gt;",
+	`"`, "&quot;",
+	"'", "&#39;",
+)
+
+func gpuHTMLEscape(s string) string {
+	return gpuHTMLReplacer.Replace(s)
+}
--- a/audit/internal/platform/gpu_metrics_test.go
+++ b/audit/internal/platform/gpu_metrics_test.go
@@ -0,0 +1,65 @@
+package platform
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
+	t.Parallel()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "gpu-metrics.csv")
+	rows := []GPUMetricRow{
+		{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
+	}
+	if err := WriteGPUMetricsCSV(path, rows); err != nil {
+		t.Fatalf("WriteGPUMetricsCSV: %v", err)
+	}
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("ReadFile: %v", err)
+	}
+	text := string(raw)
+	for _, needle := range []string{
+		"stage,elapsed_sec,gpu_index",
+		`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
+	} {
+		if !strings.Contains(text, needle) {
+			t.Fatalf("csv missing %q\n%s", needle, text)
+		}
+	}
+}
+
+func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
+	t.Parallel()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "gpu-metrics.html")
+	rows := []GPUMetricRow{
+		{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
+		{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
+		{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
+		{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
+	}
+	if err := WriteGPUMetricsHTML(path, rows); err != nil {
+		t.Fatalf("WriteGPUMetricsHTML: %v", err)
+	}
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("ReadFile: %v", err)
+	}
+	text := string(raw)
+	for _, needle := range []string{
+		"stage-legend",
+		"baseline",
+		"steady-fp16",
+		"GPU Stress Test Metrics",
+	} {
+		if !strings.Contains(text, needle) {
+			t.Fatalf("html missing %q\n%s", needle, text)
+		}
+	}
+}
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -11,20 +11,10 @@ import (
 	"strings"
 )

+const installToRAMDir = "/dev/shm/bee-live"
+
 func (s *System) IsLiveMediaInRAM() bool {
-	fsType := mountFSType("/run/live/medium")
-	if fsType == "" {
-		// No medium mount at all — fall back to toram kernel parameter.
-		return toramActive()
-	}
-	if strings.EqualFold(fsType, "tmpfs") {
-		return true
-	}
-	// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
-	// mount of /run/live/medium fails (common for CD-ROM boots), the medium
-	// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
-	files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
-	return len(files) > 0
+	return s.LiveMediaRAMState().InRAM
 }

 func (s *System) LiveBootSource() LiveBootSource {
@@ -56,14 +46,95 @@ func (s *System) LiveBootSource() LiveBootSource {
 	return status
 }

-func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
+func (s *System) LiveMediaRAMState() LiveMediaRAMState {
+	return evaluateLiveMediaRAMState(
+		s.LiveBootSource(),
+		toramActive(),
+		globPaths("/run/live/medium/live/*.squashfs"),
+		globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
+	)
+}
+
+// evaluateLiveMediaRAMState classifies the copy-to-RAM situation from inputs the
+// caller has already collected: the boot source, the toram flag, and the
+// squashfs paths found on the boot medium and in the RAM copy directory. It
+// performs no I/O itself.
+func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
+	state := LiveMediaRAMState{
+		LiveBootSource: status,
+		ToramActive:    toram,
+		CopyPresent:    len(copiedSquashfs) > 0,
+	}
+	if status.InRAM {
+		state.State = "in_ram"
+		state.Status = "ok"
+		state.CopyComplete = true
+		state.Message = "Running from RAM — installation media can be safely disconnected."
+		return state
+	}
+
+	// Completeness is judged by file name only: every squashfs on the medium
+	// must have a same-named file in the RAM copy.
+	want := pathBaseSet(sourceSquashfs)
+	have := pathBaseSet(copiedSquashfs)
+	state.CopyComplete = len(want) > 0 && setContainsAll(have, want)
+
+	// Every state other than in_ram allows starting (or restarting) a copy.
+	state.CanStartCopy = true
+	switch {
+	case state.CopyComplete:
+		state.State, state.Status = "partial", "partial"
+		state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
+	case state.CopyPresent:
+		state.State, state.Status = "partial", "partial"
+		state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
+	case toram:
+		state.State, state.Status = "toram_failed", "failed"
+		state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
+	default:
+		state.State, state.Status = "not_in_ram", "warning"
+		state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
+	}
+	return state
+}
+
+func globPaths(pattern string) []string {
+	matches, _ := filepath.Glob(pattern)
+	return matches
+}
+
+func pathBaseSet(paths []string) map[string]struct{} {
+	out := make(map[string]struct{}, len(paths))
+	for _, path := range paths {
+		base := strings.TrimSpace(filepath.Base(path))
+		if base != "" {
+			out[base] = struct{}{}
+		}
+	}
+	return out
+}
+
+func setContainsAll(have, want map[string]struct{}) bool {
+	if len(want) == 0 {
+		return false
+	}
+	for name := range want {
+		if _, ok := have[name]; !ok {
+			return false
+		}
+	}
+	return true
+}
+
+func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
 	log := func(msg string) {
 		if logFunc != nil {
 			logFunc(msg)
 		}
 	}

-	if s.IsLiveMediaInRAM() {
+	state := s.LiveMediaRAMState()
+	if state.InRAM {
 		log("Already running from RAM — installation media can be safely disconnected.")
 		return nil
 	}
@@ -88,10 +159,21 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
 			humanBytes(needed+headroom), humanBytes(free))
 	}

-	dstDir := "/dev/shm/bee-live"
+	dstDir := installToRAMDir
+	if state.CopyPresent {
+		log("Removing stale partial RAM copy before retry...")
+	}
+	_ = os.RemoveAll(dstDir)
 	if err := os.MkdirAll(dstDir, 0755); err != nil {
 		return fmt.Errorf("create tmpfs dir: %v", err)
 	}
+	defer func() {
+		if retErr == nil {
+			return
+		}
+		_ = os.RemoveAll(dstDir)
+		log("Removed incomplete RAM copy.")
+	}()

 	for _, sf := range squashfsFiles {
 		if err := ctx.Err(); err != nil {
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -58,3 +58,46 @@ func TestDescribeLiveBootSource(t *testing.T) {
 		t.Fatalf("got %q want /run/live/medium", got)
 	}
 }
+
+func TestEvaluateLiveMediaRAMState(t *testing.T) {
+	t.Parallel()
+
+	t.Run("in_ram", func(t *testing.T) {
+		state := evaluateLiveMediaRAMState(
+			LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
+			false,
+			nil,
+			nil,
+		)
+		if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
+			t.Fatalf("state=%+v", state)
+		}
+	})
+
+	t.Run("partial_copy_after_cancel", func(t *testing.T) {
+		state := evaluateLiveMediaRAMState(
+			LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
+			false,
+			[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
+			[]string{"/dev/shm/bee-live/filesystem.squashfs"},
+		)
+		if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
+			t.Fatalf("state=%+v", state)
+		}
+		if state.CopyComplete {
+			t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
+		}
+	})
+
+	t.Run("toram_failed", func(t *testing.T) {
+		state := evaluateLiveMediaRAMState(
+			LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
+			true,
+			nil,
+			nil,
+		)
+		if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
+			t.Fatalf("state=%+v", state)
+		}
+	})
+}
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -28,6 +28,8 @@ var runtimeTrackedServices = []string{
 	"bee-audit",
 	"bee-web",
 	"bee-sshsetup",
+	"nvidia-dcgm",
+	"nvidia-fabricmanager",
 }

 func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
@@ -171,25 +173,28 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
 	return ToolStatus{Name: display}
 }

-// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
-// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
-// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
+// collectToRAMHealth evaluates whether the live system is fully running from RAM.
+// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
+// incomplete RAM copy exists but runtime still depends on the boot medium,
+// "failed" = toram was requested but medium is not in RAM.
 func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
-	inRAM := s.IsLiveMediaInRAM()
-	active := toramActive()
-	switch {
-	case inRAM:
-		health.ToRAMStatus = "ok"
-	case active:
-		// toram was requested but medium is not yet/no longer in RAM
-		health.ToRAMStatus = "failed"
+	state := s.LiveMediaRAMState()
+	health.ToRAMStatus = state.Status
+	switch state.Status {
+	case "ok":
+		return
+	case "failed":
 		health.Issues = append(health.Issues, schema.RuntimeIssue{
 			Code:        "toram_copy_failed",
 			Severity:    "warning",
-			Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
+			Description: state.Message,
+		})
+	case "partial":
+		health.Issues = append(health.Issues, schema.RuntimeIssue{
+			Code:        "toram_copy_partial",
+			Severity:    "warning",
+			Description: state.Message,
 		})
-	default:
-		health.ToRAMStatus = "warning"
 	}
 }

@@ -211,13 +216,13 @@ func findUSBExportMount() string {

 	// fs types that are expected on USB export drives
 	exportFSTypes := map[string]bool{
-		"vfat":  true,
-		"exfat": true,
-		"ext2":  true,
-		"ext3":  true,
-		"ext4":  true,
-		"ntfs":  true,
-		"ntfs3": true,
+		"vfat":    true,
+		"exfat":   true,
+		"ext2":    true,
+		"ext3":    true,
+		"ext4":    true,
+		"ntfs":    true,
+		"ntfs3":   true,
 		"fuseblk": true,
 	}

--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -108,15 +108,15 @@ type nvidiaGPUHealth struct {
 }

 type nvidiaGPUStatusFile struct {
-	Index       int
-	Name        string
-	RunStatus   string
-	Reason      string
-	Health      string
-	HealthRaw   string
-	Observed    bool
-	Selected    bool
-	FailingJob  string
+	Index      int
+	Name       string
+	RunStatus  string
+	Reason     string
+	Health     string
+	HealthRaw  string
+	Observed   bool
+	Selected   bool
+	FailingJob string
 }

 // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	return string(raw), err
 }

-// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-	// detect GPU count
-	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
-	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	gpuCount := len(selected)
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
-		}},
+		}, env: nvidiaVisibleDevicesEnv(selected)},
 	), logFunc)
 }

@@ -410,13 +412,13 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
-			satJob{
-				name:       "03-dcgmproftester.log",
-				cmd:        profCmd,
-				env:        profEnv,
-				collectGPU: true,
-				gpuIndices: selected,
-			},
+		satJob{
+			name:       "03-dcgmproftester.log",
+			cmd:        profCmd,
+			env:        profEnv,
+			collectGPU: true,
+			gpuIndices: selected,
+		},
 		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
 	), logFunc)
 }
@@ -426,6 +428,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -443,6 +452,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -460,6 +476,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -552,9 +575,19 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
 	if passes <= 0 {
 		passes = 1
 	}
+	// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
+	// intentionally conservative enough for healthy systems while avoiding the
+	// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
+	timeoutSec := sizeMB*passes*20/100 + 60
+	if timeoutSec < 180 {
+		timeoutSec = 180
+	}
+	if timeoutSec > 900 {
+		timeoutSec = 900
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
-		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
+		{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
 		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
 	}, logFunc)
 }
@@ -1382,8 +1415,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
 	if len(metricRows) > 0 {
 		_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
 		_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
-		chart := RenderGPUTerminalChart(metricRows)
-		_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
 	}

 	return out, err
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"math"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
 	UpdatedAt time.Time
 }

+type fanObservationState struct {
+	MaxRPM map[string]float64 `json:"max_rpm"`
+}
+
+type fanPeakCandidate struct {
+	FirstSeen time.Time
+	RPM       float64
+}
+
 var (
 	systemPowerCacheMu sync.Mutex
 	systemPowerCache   cachedPowerReading
+	fanObservationMu   sync.Mutex
+	fanObservation     fanObservationState
+	fanObservationInit bool
+	fanPeakCandidates  = make(map[string]fanPeakCandidate)
 )

 const systemPowerHoldTTL = 15 * time.Second

+var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
+
+const fanObservationMinPeakHold = time.Second
+
+// normalizeObservedFanMaxRPM rounds a positive RPM reading up to the next
+// multiple of 1000. Zero and negative readings normalize to 0.
+func normalizeObservedFanMaxRPM(rpm float64) float64 {
+	roundedUp := math.Ceil(rpm/1000.0) * 1000.0
+	return math.Max(0, roundedUp)
+}
+
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
 	if err == nil {
 		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
+			updateFanObservation(fans, time.Now())
 			return fans, nil
 		}
 	}
 	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
 	if len(fans) > 0 {
+		updateFanObservation(fans, time.Now())
 		return fans, nil
 	}
 	if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	return nil, sensorsErr
 }

+// loadFanObservationLocked lazily seeds fanObservation.MaxRPM from the state
+// file on first use. Callers are expected to hold fanObservationMu.
+func loadFanObservationLocked() {
+	if fanObservationInit {
+		return
+	}
+	fanObservationInit = true
+	fanObservation.MaxRPM = make(map[string]float64)
+	data, readErr := os.ReadFile(fanObservationStatePath)
+	if readErr != nil || len(data) == 0 {
+		return
+	}
+	var stored fanObservationState
+	if json.Unmarshal(data, &stored) != nil {
+		return
+	}
+	for rawName, maxRPM := range stored.MaxRPM {
+		if key := strings.TrimSpace(rawName); key != "" && maxRPM > 0 {
+			fanObservation.MaxRPM[key] = maxRPM
+		}
+	}
+}
+
+// saveFanObservationLocked writes the observed per-fan maxima to the state
+// file as indented JSON. It is best-effort: every failure is silently dropped.
+func saveFanObservationLocked() {
+	if len(fanObservation.MaxRPM) == 0 {
+		return
+	}
+	stateDir := filepath.Dir(fanObservationStatePath)
+	if stateDir == "." || stateDir == "" {
+		stateDir = "/var/log/bee-sat"
+	}
+	if os.MkdirAll(stateDir, 0755) != nil {
+		return
+	}
+	if encoded, err := json.MarshalIndent(fanObservation, "", "  "); err == nil {
+		_ = os.WriteFile(fanObservationStatePath, encoded, 0644)
+	}
+}
+
+// updateFanObservation folds a batch of readings into the observed per-fan
+// maxima. A reading above the stored maximum becomes a peak candidate and is
+// promoted only once a later above-maximum sample arrives at least
+// fanObservationMinPeakHold after it; lower samples discard the candidate.
+func updateFanObservation(fans []FanReading, now time.Time) {
+	if len(fans) == 0 {
+		return
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+	dirty := false
+	for _, reading := range fans {
+		key := strings.TrimSpace(reading.Name)
+		if key == "" || reading.RPM <= 0 {
+			continue
+		}
+		knownMax := fanObservation.MaxRPM[key]
+		pending, tracked := fanPeakCandidates[key]
+		switch {
+		case reading.RPM <= knownMax:
+			delete(fanPeakCandidates, key)
+		case !tracked:
+			fanPeakCandidates[key] = fanPeakCandidate{FirstSeen: now, RPM: reading.RPM}
+		case now.Sub(pending.FirstSeen) >= fanObservationMinPeakHold:
+			if peak := math.Max(pending.RPM, reading.RPM); peak > knownMax {
+				fanObservation.MaxRPM[key] = normalizeObservedFanMaxRPM(peak)
+				dirty = true
+			}
+			delete(fanPeakCandidates, key)
+		case reading.RPM > pending.RPM:
+			pending.RPM = reading.RPM
+			fanPeakCandidates[key] = pending
+		}
+	}
+	if dirty {
+		saveFanObservationLocked()
+	}
+}
+
+// estimateFanDutyCyclePctFromObservation averages each fan's RPM as a percentage
+// of its observed maximum; it reports false when no reading has a usable maximum.
+func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
+	if len(fans) == 0 {
+		return 0, false
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+	var ratios []float64
+	for _, reading := range fans {
+		key := strings.TrimSpace(reading.Name)
+		if key == "" || reading.RPM <= 0 {
+			continue
+		}
+		ceiling, known := fanObservation.MaxRPM[key]
+		if !known || ceiling <= 0 {
+			continue
+		}
+		// RPM and ceiling are both positive here, so only the upper bound needs a clamp.
+		share := reading.RPM / ceiling * 100.0
+		if share > 100 {
+			share = 100
+		}
+		ratios = append(ratios, share)
+	}
+	if len(ratios) == 0 {
+		return 0, false
+	}
+	return benchmarkMean(ratios), true
+}
+
 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
 // Handles two formats:
 //
@@ -426,6 +566,116 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
 	return fans, nil
 }

+// sampleFanDutyCyclePct returns the mean PWM duty cycle reported by lm-sensors. When
+// sensors fails or exposes no PWM control it falls back to an RPM-based estimate (third result true).
+func sampleFanDutyCyclePct() (float64, bool, bool) {
+	if out, err := exec.Command("sensors", "-j").Output(); err == nil && len(out) > 0 {
+		if pct, ok := parseFanDutyCyclePctSensorsJSON(out); ok {
+			return pct, true, false
+		}
+	}
+	fans, fanErr := sampleFanSpeeds()
+	if fanErr != nil {
+		return 0, false, false
+	}
+	return sampleFanDutyCyclePctFromFans(fans)
+}
+
+// sampleFanDutyCyclePctFromFans estimates the duty cycle from fan RPM readings.
+// The third result marks the value as an estimate rather than a PWM reading.
+func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
+	if len(fans) > 0 {
+		if estimate, found := estimateFanDutyCyclePctFromObservation(fans); found {
+			return estimate, true, true
+		}
+	}
+	return 0, false, false
+}
+
+// parseFanDutyCyclePctSensorsJSON averages the duty-cycle values found in
+// `sensors -j` output, skipping "Adapter" entries and non-object features.
+// It reports false when the JSON does not decode or no value is accepted.
+func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
+	var chips map[string]map[string]any
+	if json.Unmarshal(raw, &chips) != nil {
+		return 0, false
+	}
+	var duties []float64
+	for _, chip := range chips {
+		for label, entry := range chip {
+			fields, isObject := entry.(map[string]any)
+			if !isObject || strings.EqualFold(label, "Adapter") {
+				continue
+			}
+			if pct, found := firstFanDutyValue(label, fields); found {
+				duties = append(duties, pct)
+			}
+		}
+	}
+	if len(duties) == 0 {
+		return 0, false
+	}
+	return benchmarkMean(duties), true
+}
+
+// firstFanDutyValue returns the first usable duty value of one sensors feature.
+// Names or keys containing "enable", "mode", or "alarm" are never used. For a
+// feature whose name contains "pwm" the "input", "value", and "current" keys
+// are tried first; then keys containing "pwm" are tried in sorted order.
+func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
+	isControlFlag := func(s string) bool {
+		return strings.Contains(s, "enable") || strings.Contains(s, "mode") || strings.Contains(s, "alarm")
+	}
+	featureName = strings.ToLower(strings.TrimSpace(featureName))
+	if isControlFlag(featureName) {
+		return 0, false
+	}
+	if strings.Contains(featureName, "pwm") {
+		for _, direct := range []string{"input", "value", "current"} {
+			if duty, parsed := parseFanDutyValue(feature[direct]); parsed {
+				return duty, true
+			}
+		}
+	}
+	names := make([]string, 0, len(feature))
+	for k := range feature {
+		if lk := strings.ToLower(k); strings.Contains(lk, "pwm") && !isControlFlag(lk) {
+			names = append(names, k)
+		}
+	}
+	sort.Strings(names)
+	for _, k := range names {
+		if duty, parsed := parseFanDutyValue(feature[k]); parsed {
+			return duty, true
+		}
+	}
+	return 0, false
+}
+
+// parseFanDutyValue converts a float64 or numeric string to a duty percentage.
+func parseFanDutyValue(value any) (float64, bool) {
+	if num, isNum := value.(float64); isNum {
+		return normalizePWMAsDutyPct(num)
+	}
+	text, isText := value.(string)
+	if parsed, err := strconv.ParseFloat(strings.TrimSpace(text), 64); isText && err == nil {
+		return normalizePWMAsDutyPct(parsed)
+	}
+	return 0, false
+}
+
+// normalizePWMAsDutyPct maps a raw PWM reading to a duty-cycle percentage:
+// [0, 100] passes through, (100, 255] is scaled by 255; the rest is rejected.
+func normalizePWMAsDutyPct(raw float64) (float64, bool) {
+	switch {
+	case raw >= 0 && raw <= 100:
+		return raw, true
+	case raw > 100 && raw <= 255:
+		return raw / 255.0 * 100.0, true
+	default:
+		return 0, false
+	}
+}
+
 func firstFanInputValue(feature map[string]any) (float64, bool) {
 	keys := make([]string, 0, len(feature))
 	for key := range feature {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -1,6 +1,7 @@
 package platform

 import (
+	"path/filepath"
 	"testing"
 	"time"
 )
@@ -29,6 +30,74 @@ func TestFirstFanInputValue(t *testing.T) {
 	}
 }

+func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
+	raw := []byte(`{
+		"chip0": {
+			"fan1": {"input": 9000},
+			"pwm1": {"input": 128},
+			"pwm1_enable": {"input": 1}
+		},
+		"chip1": {
+			"pwm2": {"input": 64}
+		}
+	}`)
+	// pwm1=128 scales to ~50.2%, pwm2=64 is taken as 64%: mean ~57.1. fan1 and pwm1_enable are ignored.
+	got, ok := parseFanDutyCyclePctSensorsJSON(raw)
+	if !ok {
+		t.Fatalf("expected duty cycle telemetry to be parsed")
+	}
+	if got < 57 || got > 58 {
+		t.Fatalf("got=%v want ~57.1", got)
+	}
+}
+
+func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
+	// Deliberately not parallel: the test swaps package-level fan observation state.
+
+	oldPath := fanObservationStatePath
+	oldState := fanObservation
+	oldInit := fanObservationInit
+	oldCandidates := fanPeakCandidates
+	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	t.Cleanup(func() {
+		fanObservationStatePath = oldPath
+		fanObservation = oldState
+		fanObservationInit = oldInit
+		fanPeakCandidates = oldCandidates
+	})
+
+	start := time.Unix(100, 0)
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
+	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
+		t.Fatalf("single-sample spike should not establish observed max")
+	}
+	// Samples still above the maximum after fanObservationMinPeakHold promote the 5200 peak, stored as 6000.
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
+
+	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected estimated duty cycle from persisted observed max")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("got=%v want ~43.3", got)
+	}
+	// Drop the in-memory state so the next estimate has to reload the maximum from disk.
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected persisted observed max to be reloaded from disk")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("reloaded got=%v want ~43.3", got)
+	}
+}
+
 func TestParseDCMIPowerReading(t *testing.T) {
 	raw := `
 Instantaneous power reading:                   512 Watts
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 	}
 }

+func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
+	cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
+	want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
+	if len(cmd) != len(want) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
+	}
+	for i := range want {
+		if cmd[i] != want[i] {
+			t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
+		}
+	}
+}
+
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
 	if len(env) != 2 {
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -9,6 +9,17 @@ type LiveBootSource struct {
 	Device string `json:"device,omitempty"`
 }

+type LiveMediaRAMState struct {
+	LiveBootSource
+	State        string `json:"state"`
+	Status       string `json:"status"`
+	ToramActive  bool   `json:"toram_active,omitempty"`
+	CopyPresent  bool   `json:"copy_present,omitempty"`
+	CopyComplete bool   `json:"copy_complete,omitempty"`
+	CanStartCopy bool   `json:"can_start_copy,omitempty"`
+	Message      string `json:"message,omitempty"`
+}
+
 type InterfaceInfo struct {
 	Name  string
 	State string
--- a/audit/internal/schema/hardware.go
+++ b/audit/internal/schema/hardware.go
@@ -15,17 +15,17 @@ type HardwareIngestRequest struct {
 }

 type RuntimeHealth struct {
-	Status        string                 `json:"status"`
-	CheckedAt     string                 `json:"checked_at"`
-	ExportDir     string                 `json:"export_dir,omitempty"`
-	DriverReady   bool                   `json:"driver_ready,omitempty"`
-	CUDAReady     bool                   `json:"cuda_ready,omitempty"`
-	NvidiaGSPMode string                 `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
-	NetworkStatus string                 `json:"network_status,omitempty"`
-	// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
-	ToRAMStatus   string `json:"toram_status,omitempty"`
+	Status        string `json:"status"`
+	CheckedAt     string `json:"checked_at"`
+	ExportDir     string `json:"export_dir,omitempty"`
+	DriverReady   bool   `json:"driver_ready,omitempty"`
+	CUDAReady     bool   `json:"cuda_ready,omitempty"`
+	NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
+	NetworkStatus string `json:"network_status,omitempty"`
+	// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
+	ToRAMStatus string `json:"toram_status,omitempty"`
 	// USBExportPath: mount point of the first writable USB drive found, empty if none.
-	USBExportPath string `json:"usb_export_path,omitempty"`
+	USBExportPath string                 `json:"usb_export_path,omitempty"`
 	Issues        []RuntimeIssue         `json:"issues,omitempty"`
 	Tools         []RuntimeToolStatus    `json:"tools,omitempty"`
 	Services      []RuntimeServiceStatus `json:"services,omitempty"`
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -36,6 +36,16 @@ var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, err
 	return a.ListNvidiaGPUStatuses()
 }

+const (
+	taskPriorityBenchmark      = 10
+	taskPriorityBurn           = 20
+	taskPriorityValidateStress = 30
+	taskPriorityValidate       = 40
+	taskPriorityAudit          = 50
+	taskPriorityInstallToRAM   = 60
+	taskPriorityInstall        = 70
+)
+
 // ── Job ID counter ────────────────────────────────────────────────────────────

 var jobCounter atomic.Uint64
@@ -100,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {

 func shouldSplitHomogeneousNvidiaTarget(target string) bool {
 	switch strings.TrimSpace(target) {
-	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
+	case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
 		"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
 		"nvidia-bandwidth", "nvidia-stress":
 		return true
@@ -109,6 +119,30 @@ func shouldSplitHomogeneousNvidiaTarget(target string) bool {
 	}
 }

+// defaultTaskPriority returns the default queue priority for a task target (0 if
+// unknown); validate targets use taskPriorityValidateStress when params.StressMode is set.
+func defaultTaskPriority(target string, params taskParams) int {
+	switch kind := strings.TrimSpace(target); kind {
+	case "nvidia-bench-perf", "nvidia-bench-power":
+		return taskPriorityBenchmark
+	case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
+		return taskPriorityBurn
+	case "audit":
+		return taskPriorityAudit
+	case "install-to-ram":
+		return taskPriorityInstallToRAM
+	case "install":
+		return taskPriorityInstall
+	case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
+		"nvidia-bandwidth", "memory", "storage", "cpu", "amd", "amd-mem", "amd-bandwidth":
+		if params.StressMode {
+			return taskPriorityValidateStress
+		}
+		return taskPriorityValidate
+	}
+	return 0
+}
+
 func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
 	if len(gpus) == 0 {
 		return nil, fmt.Errorf("no NVIDIA GPUs detected")
@@ -458,6 +492,7 @@ func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) {
 		ID:        newJobID("audit"),
 		Name:      "Audit",
 		Target:    "audit",
+		Priority:  defaultTaskPriority("audit", taskParams{}),
 		Status:    TaskPending,
 		CreatedAt: time.Now(),
 	}
@@ -491,13 +526,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 			return
 		}

-			var body struct {
-				Duration           int      `json:"duration"`
-				StressMode         bool     `json:"stress_mode"`
-				GPUIndices         []int    `json:"gpu_indices"`
-				ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
-				StaggerGPUStart    bool     `json:"stagger_gpu_start"`
-				Loader             string   `json:"loader"`
+		var body struct {
+			Duration           int      `json:"duration"`
+			StressMode         bool     `json:"stress_mode"`
+			GPUIndices         []int    `json:"gpu_indices"`
+			ExcludeGPUIndices  []int    `json:"exclude_gpu_indices"`
+			StaggerGPUStart    bool     `json:"stagger_gpu_start"`
+			ParallelGPUs       bool     `json:"parallel_gpus"`
+			Loader             string   `json:"loader"`
 			Profile            string   `json:"profile"`
 			DisplayName        string   `json:"display_name"`
 			PlatformComponents []string `json:"platform_components"`
@@ -513,18 +549,147 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		if strings.TrimSpace(body.DisplayName) != "" {
 			name = body.DisplayName
 		}
-			params := taskParams{
-				Duration:           body.Duration,
-				StressMode:         body.StressMode,
-				GPUIndices:         body.GPUIndices,
-				ExcludeGPUIndices:  body.ExcludeGPUIndices,
-				StaggerGPUStart:    body.StaggerGPUStart,
-				Loader:             body.Loader,
+		params := taskParams{
+			Duration:           body.Duration,
+			StressMode:         body.StressMode,
+			GPUIndices:         body.GPUIndices,
+			ExcludeGPUIndices:  body.ExcludeGPUIndices,
+			StaggerGPUStart:    body.StaggerGPUStart,
+			ParallelGPUs:       body.ParallelGPUs,
+			Loader:             body.Loader,
 			BurnProfile:        body.Profile,
 			DisplayName:        body.DisplayName,
 			PlatformComponents: body.PlatformComponents,
 		}
-		tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
+		tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "sat-"+target)
+		if err != nil {
+			writeError(w, http.StatusBadRequest, err.Error())
+			return
+		}
+		for _, t := range tasks {
+			globalQueue.enqueue(t)
+		}
+		writeTaskRunResponse(w, tasks)
+	}
+}
+
+func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if h.opts.App == nil {
+			writeError(w, http.StatusServiceUnavailable, "app not configured")
+			return
+		}
+
+		var body struct {
+			Profile           string `json:"profile"`
+			SizeMB            int    `json:"size_mb"`
+			GPUIndices        []int  `json:"gpu_indices"`
+			ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
+			RunNCCL           *bool  `json:"run_nccl"`
+			ParallelGPUs      *bool  `json:"parallel_gpus"`
+			RampUp            *bool  `json:"ramp_up"`
+			DisplayName       string `json:"display_name"`
+		}
+		if r.Body != nil {
+			if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
+				writeError(w, http.StatusBadRequest, "invalid request body")
+				return
+			}
+		}
+
+		runNCCL := true
+		if body.RunNCCL != nil {
+			runNCCL = *body.RunNCCL
+		}
+		parallelGPUs := false
+		if body.ParallelGPUs != nil {
+			parallelGPUs = *body.ParallelGPUs
+		}
+		rampUp := false
+		if body.RampUp != nil {
+			rampUp = *body.RampUp
+		}
+		// Build a descriptive base name that includes profile and mode so the task
+		// list is self-explanatory without opening individual task detail pages.
+		profile := strings.TrimSpace(body.Profile)
+		if profile == "" {
+			profile = "standard"
+		}
+		name := taskDisplayName(target, "", "")
+		if strings.TrimSpace(body.DisplayName) != "" {
+			name = body.DisplayName
+		}
+		// Append profile tag.
+		name = fmt.Sprintf("%s · %s", name, profile)
+
+		if target == "nvidia-bench-power" && parallelGPUs {
+			writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
+			return
+		}
+
+		if rampUp && len(body.GPUIndices) > 1 {
+			// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
+			// in Phase 2 (one additional GPU per step). A single task with all
+			// selected GPUs is sufficient — spawning N tasks with growing subsets
+			// would repeat all earlier steps redundantly.
+			gpus, err := apiListNvidiaGPUs(h.opts.App)
+			if err != nil {
+				writeError(w, http.StatusBadRequest, err.Error())
+				return
+			}
+			resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
+			if err != nil {
+				writeError(w, http.StatusBadRequest, err.Error())
+				return
+			}
+			if len(resolved) < 2 {
+				// Fall through to normal single-task path.
+				rampUp = false
+			} else {
+				now := time.Now()
+				rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
+				taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
+				t := &Task{
+					ID:        newJobID("bee-bench-nvidia"),
+					Name:      taskName,
+					Target:    target,
+					Priority:  defaultTaskPriority(target, taskParams{}),
+					Status:    TaskPending,
+					CreatedAt: now,
+					params: taskParams{
+						GPUIndices:       append([]int(nil), resolved...),
+						SizeMB:           body.SizeMB,
+						BenchmarkProfile: body.Profile,
+						RunNCCL:          runNCCL,
+						ParallelGPUs:     true,
+						RampTotal:        len(resolved),
+						RampRunID:        rampRunID,
+						DisplayName:      taskName,
+					},
+				}
+				globalQueue.enqueue(t)
+				writeTaskRunResponse(w, []*Task{t})
+				return
+			}
+		}
+
+		// For non-ramp tasks append mode tag.
+		if parallelGPUs {
+			name = fmt.Sprintf("%s · parallel", name)
+		} else {
+			name = fmt.Sprintf("%s · sequential", name)
+		}
+
+		params := taskParams{
+			GPUIndices:        body.GPUIndices,
+			ExcludeGPUIndices: body.ExcludeGPUIndices,
+			SizeMB:            body.SizeMB,
+			BenchmarkProfile:  body.Profile,
+			RunNCCL:           runNCCL,
+			ParallelGPUs:      parallelGPUs,
+			DisplayName:       body.DisplayName,
+		}
+		tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
 		if err != nil {
 			writeError(w, http.StatusBadRequest, err.Error())
 			return
@@ -537,129 +702,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 }

 func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
-	if h.opts.App == nil {
-		writeError(w, http.StatusServiceUnavailable, "app not configured")
-		return
-	}
-
-	var body struct {
-		Profile           string `json:"profile"`
-		SizeMB            int    `json:"size_mb"`
-		GPUIndices        []int  `json:"gpu_indices"`
-		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
-		RunNCCL           *bool  `json:"run_nccl"`
-		ParallelGPUs      *bool  `json:"parallel_gpus"`
-		RampUp            *bool  `json:"ramp_up"`
-		DisplayName       string `json:"display_name"`
-	}
-	if r.Body != nil {
-		if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
-			writeError(w, http.StatusBadRequest, "invalid request body")
-			return
-		}
-	}
-
-	runNCCL := true
-	if body.RunNCCL != nil {
-		runNCCL = *body.RunNCCL
-	}
-	parallelGPUs := false
-	if body.ParallelGPUs != nil {
-		parallelGPUs = *body.ParallelGPUs
-	}
-	rampUp := false
-	if body.RampUp != nil {
-		rampUp = *body.RampUp
-	}
-	// Build a descriptive base name that includes profile and mode so the task
-	// list is self-explanatory without opening individual task detail pages.
-	profile := strings.TrimSpace(body.Profile)
-	if profile == "" {
-		profile = "standard"
-	}
-	name := taskDisplayName("nvidia-benchmark", "", "")
-	if strings.TrimSpace(body.DisplayName) != "" {
-		name = body.DisplayName
-	}
-	// Append profile tag.
-	name = fmt.Sprintf("%s · %s", name, profile)
-
-	if rampUp && len(body.GPUIndices) > 1 {
-		// Ramp-up mode: resolve GPU list, then create one task per prefix
-		// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
-		gpus, err := apiListNvidiaGPUs(h.opts.App)
-		if err != nil {
-			writeError(w, http.StatusBadRequest, err.Error())
-			return
-		}
-		resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
-		if err != nil {
-			writeError(w, http.StatusBadRequest, err.Error())
-			return
-		}
-		if len(resolved) < 2 {
-			// Fall through to normal single-task path.
-			rampUp = false
-		} else {
-			now := time.Now()
-			rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
-			var allTasks []*Task
-			for step := 1; step <= len(resolved); step++ {
-				subset := resolved[:step]
-				stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
-				t := &Task{
-					ID:        newJobID("benchmark-nvidia"),
-					Name:      stepName,
-					Target:    "nvidia-benchmark",
-					Priority:  15,
-					Status:    TaskPending,
-					CreatedAt: now,
-					params: taskParams{
-						GPUIndices:       append([]int(nil), subset...),
-						SizeMB:           body.SizeMB,
-						BenchmarkProfile: body.Profile,
-						RunNCCL:          runNCCL && step == len(resolved),
-						ParallelGPUs:     true,
-						RampStep:         step,
-						RampTotal:        len(resolved),
-						RampRunID:        rampRunID,
-						DisplayName:      stepName,
-					},
-				}
-				allTasks = append(allTasks, t)
-			}
-			for _, t := range allTasks {
-				globalQueue.enqueue(t)
-			}
-			writeTaskRunResponse(w, allTasks)
-			return
-		}
-	}
-
-	// For non-ramp tasks append mode tag.
-	if parallelGPUs {
-		name = fmt.Sprintf("%s · parallel", name)
-	} else {
-		name = fmt.Sprintf("%s · sequential", name)
-	}
-
-	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
-		GPUIndices:        body.GPUIndices,
-		ExcludeGPUIndices: body.ExcludeGPUIndices,
-		SizeMB:            body.SizeMB,
-		BenchmarkProfile:  body.Profile,
-		RunNCCL:           runNCCL,
-		ParallelGPUs:      parallelGPUs,
-		DisplayName:       body.DisplayName,
-	}, name, h.opts.App, "benchmark-nvidia")
-	if err != nil {
-		writeError(w, http.StatusBadRequest, err.Error())
-		return
-	}
-	for _, t := range tasks {
-		globalQueue.enqueue(t)
-	}
-	writeTaskRunResponse(w, tasks)
+	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
 }

 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
@@ -694,6 +737,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			now := time.Now()
 			t.DoneAt = &now
@@ -1034,25 +1080,62 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
-	status := h.opts.App.LiveBootSource()
+	status := h.currentRAMStatus()
 	w.Header().Set("Content-Type", "application/json")
 	_ = json.NewEncoder(w).Encode(status)
 }

+type ramStatusResponse struct {
+	platform.LiveMediaRAMState
+	InstallTaskActive bool   `json:"install_task_active,omitempty"`
+	CopyTaskActive    bool   `json:"copy_task_active,omitempty"`
+	CanStartTask      bool   `json:"can_start_task,omitempty"`
+	BlockedReason     string `json:"blocked_reason,omitempty"`
+}
+
+// currentRAMStatus merges the platform's live-media RAM state with task queue
+// activity. The first applicable blocker wins: an active disk install, an
+// active install-to-RAM task, then already running from RAM. Otherwise
+// CanStartTask mirrors CanStartCopy, with Message as the blocked reason.
+func (h *handler) currentRAMStatus() ramStatusResponse {
+	media := h.opts.App.LiveMediaRAMState()
+	out := ramStatusResponse{LiveMediaRAMState: media}
+	switch {
+	case globalQueue.hasActiveTarget("install"):
+		out.InstallTaskActive = true
+		out.BlockedReason = "install to disk is already running"
+	case globalQueue.hasActiveTarget("install-to-ram"):
+		out.CopyTaskActive = true
+		out.BlockedReason = "install to RAM task is already pending or running"
+	case media.InRAM:
+		out.BlockedReason = "system is already running from RAM"
+	case media.CanStartCopy:
+		out.CanStartTask = true
+	default:
+		out.BlockedReason = media.Message
+	}
+	return out
+}
+
 func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
-	if globalQueue.hasActiveTarget("install") {
-		writeError(w, http.StatusConflict, "install to disk is already running")
+	status := h.currentRAMStatus()
+	if !status.CanStartTask {
+		msg := strings.TrimSpace(status.BlockedReason)
+		if msg == "" {
+			msg = "install to RAM is not available"
+		}
+		writeError(w, http.StatusConflict, msg)
 		return
 	}
 	t := &Task{
 		ID:        newJobID("install-to-ram"),
 		Name:      "Install to RAM",
 		Target:    "install-to-ram",
-		Priority:  10,
+		Priority:  defaultTaskPriority("install-to-ram", taskParams{}),
 		Status:    TaskPending,
 		CreatedAt: time.Now(),
 	}
@@ -1167,7 +1250,7 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
 		ID:        newJobID("install"),
 		Name:      "Install to Disk",
 		Target:    "install",
-		Priority:  20,
+		Priority:  defaultTaskPriority("install", taskParams{}),
 		Status:    TaskPending,
 		CreatedAt: time.Now(),
 		params: taskParams{
@@ -1443,6 +1526,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
 	writeJSON(w, map[string]string{"status": "rolled back"})
 }

+func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, _ *http.Request) { // request is not inspected
+	w.Header().Set("Content-Type", "text/html; charset=utf-8") // the body is an HTML fragment, not JSON
+	fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
+}
+
 func (h *handler) rollbackPendingNetworkChange() error {
 	h.pendingNetMu.Lock()
 	pnc := h.pendingNet
@@ -1459,4 +1547,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
 	}
 	return nil
 }
-
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -39,6 +39,9 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
 		t.Fatalf("burn profile=%q want smoke", got)
 	}
+	if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
+		t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
+	}
 }

 func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
@@ -61,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	t.Cleanup(func() { apiListNvidiaGPUs = prevList })

 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
-	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
+	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
 	rec := httptest.NewRecorder()

 	h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -75,8 +78,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
 	}
 	task := globalQueue.tasks[0]
-	if task.Target != "nvidia-benchmark" {
-		t.Fatalf("target=%q want nvidia-benchmark", task.Target)
+	if task.Target != "nvidia-bench-perf" {
+		t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
 	}
 	if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
 		t.Fatalf("gpu indices=%v want [1 3]", got)
@@ -84,6 +87,9 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	if task.params.RunNCCL {
 		t.Fatal("RunNCCL should reflect explicit false from request")
 	}
+	if task.Priority != taskPriorityBenchmark {
+		t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
+	}
 }

 func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
@@ -107,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
 	t.Cleanup(func() { apiListNvidiaGPUs = prevList })

 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
-	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
+	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
 	rec := httptest.NewRecorder()

 	h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -133,6 +139,56 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
 	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
 		t.Fatalf("task[1] gpu indices=%v want [2]", got)
 	}
+	if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
+		t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
+	}
+	if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
+		t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
+	}
+}
+
+func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil // start from an empty queue; the cleanup below restores the saved tasks
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) { // stub GPU listing: three GPUs with the same name
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H100 PCIe"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req) // call the handler for the power target directly
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 3 { // ramp_up over three selected GPUs is expected to queue three tasks
+		t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
+	}
+	for i, task := range globalQueue.tasks {
+		if task.Target != "nvidia-bench-power" {
+			t.Fatalf("task[%d] target=%q", i, task.Target)
+		}
+		if task.Priority != taskPriorityBenchmark {
+			t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
+		}
+	}
 }

 func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
@@ -175,6 +231,41 @@ func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
 	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
 		t.Fatalf("task[1] gpu indices=%v want [2]", got)
 	}
+	if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
+		t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
+	}
+	if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
+		t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
+	}
+}
+
+func TestDefaultTaskPriorityOrder(t *testing.T) {
+	got := []int{ // priorities as resolved by defaultTaskPriority, one per target/params pair
+		defaultTaskPriority("install-to-ram", taskParams{}),
+		defaultTaskPriority("audit", taskParams{}),
+		defaultTaskPriority("cpu", taskParams{}),
+		defaultTaskPriority("cpu", taskParams{StressMode: true}),
+		defaultTaskPriority("nvidia-stress", taskParams{}),
+		defaultTaskPriority("nvidia-bench-perf", taskParams{}),
+		defaultTaskPriority("nvidia-bench-power", taskParams{}),
+	}
+	want := []int{ // expected constants, index-aligned with got
+		taskPriorityInstallToRAM,
+		taskPriorityAudit,
+		taskPriorityValidate,
+		taskPriorityValidateStress,
+		taskPriorityBurn,
+		taskPriorityBenchmark,
+		taskPriorityBenchmark,
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
+		}
+	}
+	if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) { // strictly descending, except the two benchmark targets which must be equal
+		t.Fatalf("priority order=%v", got)
+	}
 }

 func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
 // isSATTarget returns true for task targets that run hardware acceptance tests.
 func isSATTarget(target string) bool {
 	switch target {
-	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
+	case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
 		"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
 		"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
 		"platform-stress":
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -72,6 +72,13 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
 .badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
 .badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
 .badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Component chips — one small square per device */
+.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
+.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
+.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
 /* Output terminal */
 .terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
 .terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
@@ -363,23 +370,25 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
 			html.EscapeString(label), html.EscapeString(value), badgeHTML))
 	}

-	cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
-	writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
+	writeRow("CPU", hwDescribeCPU(hw),
+		renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))

-	memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
-	writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
+	writeRow("Memory", hwDescribeMemory(hw),
+		renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))

-	storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
-	writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
+	writeRow("Storage", hwDescribeStorage(hw),
+		renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))

-	gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
-	writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
+	writeRow("GPU", hwDescribeGPU(hw),
+		renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))

-	psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
-	if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
-		psuRow.Status = hwPSUStatus(hw.PowerSupplies)
+	psuMatched := matchedRecords(records, nil, []string{"psu:"})
+	if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
+		// No PSU records yet — synthesise a single chip from IPMI status.
+		psuStatus := hwPSUStatus(hw.PowerSupplies)
+		psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
 	}
-	writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
+	writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))

 	if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
 		writeRow("Network", nicDesc, "")
@@ -845,6 +854,13 @@ func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow {
 			Source: "live-boot / /proc/mounts",
 			Issue:  "",
 		}
+	case "partial":
+		return runtimeHealthRow{
+			Title:  "LiveCD in RAM",
+			Status: "WARNING",
+			Source: "live-boot / /proc/mounts / /dev/shm/bee-live",
+			Issue:  "Partial or staged RAM copy detected. System is not fully running from RAM; Copy to RAM can be retried.",
+		}
 	case "failed":
 		return runtimeHealthRow{
 			Title:  "LiveCD in RAM",
@@ -885,6 +901,31 @@ func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
 	}
 }

+// firstNonEmpty returns the first value in vals that is not the empty string,
+// or "" when every value is empty (or no values were passed).
+func firstNonEmpty(vals ...string) string {
+	for _, v := range vals {
+		if v != "" {
+			return v
+		}
+	}
+	return ""
+}
+
+// matchedRecords returns all ComponentStatusRecord entries whose key matches
+// any exact key or any of the given prefixes. Used for per-device chip rendering.
+// Keys are whitespace-trimmed before matching; records with a blank key are skipped.
+func matchedRecords(records []app.ComponentStatusRecord, exact []string, prefixes []string) []app.ComponentStatusRecord {
+	var matched []app.ComponentStatusRecord
+	for _, rec := range records {
+		key := strings.TrimSpace(rec.ComponentKey)
+		if key != "" && (containsExactKey(key, exact) || hasAnyPrefix(key, prefixes)) {
+			matched = append(matched, rec)
+		}
+	}
+	return matched
+}
+
 func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
 	matched := make([]app.ComponentStatusRecord, 0)
 	for _, rec := range records {
@@ -1027,6 +1068,52 @@ func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) str
 	return strings.Join(messages, "; ")
 }

+// chipLetterClass picks the one-letter label and CSS class for a status (case and surrounding space ignored).
+func chipLetterClass(status string) (letter, cls string) {
+	letter, cls = "?", "chip-unknown" // used for any status not listed below
+	switch strings.ToUpper(strings.TrimSpace(status)) {
+	case "OK":
+		letter, cls = "O", "chip-ok"
+	case "WARNING", "WARN", "PARTIAL":
+		letter, cls = "W", "chip-warn"
+	case "CRITICAL", "FAIL", "FAILED", "ERROR":
+		letter, cls = "F", "chip-fail"
+	}
+	return letter, cls
+}
+
+// renderComponentChips renders one 20×20 chip per ComponentStatusRecord.
+// Hover tooltip shows component key, status, error summary and last check time.
+// Falls back to a single unknown chip when no records are available.
+func renderComponentChips(matched []app.ComponentStatusRecord) string {
+	if len(matched) == 0 {
+		return `<span class="chips"><span class="chip chip-unknown" title="No data">?</span></span>`
+	}
+	// Sort a copy by key so the caller's slice is never reordered as a side effect.
+	sorted := append([]app.ComponentStatusRecord(nil), matched...)
+	sort.SliceStable(sorted, func(i, j int) bool {
+		return sorted[i].ComponentKey < sorted[j].ComponentKey
+	})
+	var b strings.Builder
+	b.WriteString(`<span class="chips">`)
+	for _, rec := range sorted {
+		letter, cls := chipLetterClass(rec.Status)
+		var tooltip strings.Builder
+		tooltip.WriteString(rec.ComponentKey + ": ")
+		tooltip.WriteString(firstNonEmpty(rec.Status, "UNKNOWN"))
+		if rec.ErrorSummary != "" {
+			tooltip.WriteString(" — " + rec.ErrorSummary)
+		}
+		if !rec.LastCheckedAt.IsZero() {
+			fmt.Fprintf(&tooltip, " (checked %s)", rec.LastCheckedAt.Format("15:04:05"))
+		}
+		fmt.Fprintf(&b, `<span class="chip %s" title="%s">%s</span>`,
+			cls, html.EscapeString(tooltip.String()), letter)
+	}
+	b.WriteString(`</span>`)
+	return b.String()
+}
+
 func runtimeStatusBadge(status string) string {
 	status = strings.ToUpper(strings.TrimSpace(status))
 	badge := "badge-unknown"
@@ -1332,7 +1419,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.Memory,
 			`Runs a RAM validation pass and records memory state around the test.`,
 			`<code>free</code>, <code>memtester</code>`,
-			`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
+			`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
 		)) +
 		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
 			inv.Storage,
@@ -1394,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
 			`<code>all_reduce_perf</code> (NCCL tests)`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-bandwidth">` +
@@ -1402,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
 			`<code>nvbandwidth</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
 		)) +
 		`</div>` +
 		`</div>
@@ -1440,8 +1527,6 @@ function satModeChanged() {
    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
-    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
-    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
  ].forEach(function(item) {
    const card = document.getElementById(item.card);
    if (card) {
@@ -1689,7 +1774,7 @@ function runAllSAT() {
  const cycles = 1;
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -1928,23 +2013,10 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {

 // ── Benchmark ─────────────────────────────────────────────────────────────────

-type benchmarkHistoryColumn struct {
-	key      string
-	label    string
-	name     string
-	index    int
-	parallel bool
-}
-
-type benchmarkHistoryCell struct {
-	score   float64
-	present bool
-}
-
 type benchmarkHistoryRun struct {
 	generatedAt time.Time
 	displayTime string
-	cells       map[string]benchmarkHistoryCell
+	gpuScores   map[int]float64 // GPU index → composite score
 }

 func renderBenchmark(opts HandlerOptions) string {
@@ -1952,7 +2024,7 @@ func renderBenchmark(opts HandlerOptions) string {

 <div class="grid2">
  <div class="card">
-    <div class="card-head">NVIDIA Benchmark</div>
+    <div class="card-head">Benchmark Setup</div>
    <div class="card-body">
      <div class="form-row">
        <label>Profile</label>
@@ -1985,26 +2057,30 @@ func renderBenchmark(opts HandlerOptions) string {
        <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
      </label>
      <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
-      <button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button>
+      <div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
+        <button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
+        <button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
+      </div>
+      <span id="benchmark-run-nccl" hidden>nccl-auto</span>
      <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
    </div>
  </div>

  <div class="card">
-    <div class="card-head">Method</div>
+    <div class="card-head">Method Split</div>
    <div class="card-body">
-      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p>
+      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
      <table>
-        <tr><th>Profile</th><th>Purpose</th></tr>
-        <tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr>
-        <tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr>
-        <tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
+        <tr><th>Run Type</th><th>Engine</th><th>Question</th></tr>
+        <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td></tr>
+        <tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr>
      </table>
+      <p style="font-size:12px;color:var(--muted);margin-top:10px">Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
    </div>
  </div>
 </div>

-` + renderBenchmarkResultsCard(opts.ExportDir) + `
+` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `

 <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2042,21 +2118,24 @@ function benchmarkMode() {

 function benchmarkUpdateSelectionNote() {
  const selected = benchmarkSelectedGPUIndices();
-  const btn = document.getElementById('benchmark-run-btn');
+  const perfBtn = document.getElementById('benchmark-run-performance-btn');
+  const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
  const note = document.getElementById('benchmark-selection-note');
  if (!selected.length) {
-    btn.disabled = true;
+    perfBtn.disabled = true;
+    fitBtn.disabled = true;
    note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
    return;
  }
-  btn.disabled = false;
+  perfBtn.disabled = false;
+  fitBtn.disabled = false;
  const mode = benchmarkMode();
  if (mode === 'ramp-up') {
-    note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.';
+    note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses targeted_power per step.';
  } else if (mode === 'parallel') {
-    note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : '');
+    note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
  } else {
-    note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : '');
+    note.textContent = 'Sequential: each selected GPU benchmarked separately.';
  }
 }

@@ -2130,7 +2209,7 @@ function benchmarkSelectNone() {
  benchmarkUpdateSelectionNote();
 }

-function runNvidiaBenchmark() {
+function runNvidiaBenchmark(kind) {
  const selected = benchmarkSelectedGPUIndices();
  const status = document.getElementById('benchmark-run-status');
  if (!selected.length) {
@@ -2140,21 +2219,26 @@ function runNvidiaBenchmark() {
  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
  const mode = benchmarkMode();
  const rampUp = mode === 'ramp-up' && selected.length > 1;
-  const parallelGPUs = mode === 'parallel';
+  const parallelGPUs = mode === 'parallel' && kind === 'performance';
+  if (kind === 'power-fit' && mode === 'parallel') {
+    status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
+    return;
+  }
  const body = {
    profile: document.getElementById('benchmark-profile').value || 'standard',
    gpu_indices: selected,
-    run_nccl: selected.length > 1,
+    run_nccl: kind === 'performance' && selected.length > 1,
    parallel_gpus: parallelGPUs,
    ramp_up: rampUp,
-    display_name: 'NVIDIA Benchmark'
+    display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
  };
  document.getElementById('benchmark-output').style.display = 'block';
-  document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']';
+  document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
  const term = document.getElementById('benchmark-terminal');
-  term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n';
+  term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
  status.textContent = 'Queueing...';
-  fetch('/api/benchmark/nvidia/run', {
+  const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
+  fetch(endpoint, {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(body)
@@ -2182,7 +2266,9 @@ function runNvidiaBenchmark() {
        if (e.data) failures += 1;
        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
        term.scrollTop = term.scrollHeight;
+        const isLast = (idx + 1 >= taskIds.length);
        streamNext(idx + 1, failures);
+        if (isLast) { benchmarkRefreshResults(); }
      });
      benchmarkES.onerror = function() {
        if (benchmarkES) {
@@ -2202,21 +2288,33 @@ function runNvidiaBenchmark() {
 }

 benchmarkLoadGPUs();
+
+function benchmarkRefreshResults() { // re-fetch the rendered results cards and swap them into the page
+  fetch('/api/benchmark/results')
+    .then(function(r) { return r.ok ? r.text() : Promise.reject(new Error('HTTP ' + r.status)); })
+    .then(function(html) {
+      const el = document.getElementById('benchmark-results-section');
+      if (el) el.innerHTML = html;
+    })
+    .catch(function() {}); // best effort: on any failure keep the results already shown
+}
 </script>`
 }

 func renderBenchmarkResultsCard(exportDir string) string {
-	columns, runs := loadBenchmarkHistory(exportDir)
-	return renderBenchmarkResultsCardFromRuns(
-		"Benchmark Results",
+	maxIdx, runs := loadBenchmarkHistory(exportDir)
+	perf := renderBenchmarkResultsCardFromRuns(
+		"Performance Results",
 		"Composite score by saved benchmark run and GPU.",
-		"No saved benchmark runs yet.",
-		columns,
+		"No saved performance benchmark runs yet.",
+		maxIdx,
 		runs,
 	)
+	power := renderPowerBenchmarkResultsCard(exportDir)
+	return perf + "\n" + power
 }

-func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, columns []benchmarkHistoryColumn, runs []benchmarkHistoryRun) string {
+func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
 	if len(runs) == 0 {
 		return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
 	}
@@ -2226,22 +2324,22 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
 	}
 	b.WriteString(`<div style="overflow-x:auto">`)
-	b.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
-	for _, col := range columns {
-		b.WriteString(`<th>` + html.EscapeString(col.label) + `</th>`)
+	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
+	for i := 0; i <= maxGPUIndex; i++ {
+		b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
 	}
 	b.WriteString(`</tr></thead><tbody>`)
 	for i, run := range runs {
 		b.WriteString(`<tr>`)
 		b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
 		b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
-		for _, col := range columns {
-			cell, ok := run.cells[col.key]
-			if !ok || !cell.present {
+		for idx := 0; idx <= maxGPUIndex; idx++ {
+			score, ok := run.gpuScores[idx]
+			if !ok {
 				b.WriteString(`<td style="color:var(--muted)">-</td>`)
 				continue
 			}
-			b.WriteString(`<td>` + fmt.Sprintf("%.2f", cell.score) + `</td>`)
+			b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
 		}
 		b.WriteString(`</tr>`)
 	}
@@ -2249,22 +2347,22 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 	return b.String()
 }

-func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
-	baseDir := app.DefaultBenchmarkBaseDir
+func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
+	baseDir := app.DefaultBeeBenchPerfDir
 	if strings.TrimSpace(exportDir) != "" {
-		baseDir = filepath.Join(exportDir, "bee-benchmark")
+		baseDir = filepath.Join(exportDir, "bee-bench", "perf")
 	}
-	paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json"))
+	paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
 	if err != nil || len(paths) == 0 {
-		return nil, nil
+		return -1, nil
 	}
 	sort.Strings(paths)
 	return loadBenchmarkHistoryFromPaths(paths)
 }

-func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
-	columnByKey := make(map[string]benchmarkHistoryColumn)
+func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
 	runs := make([]benchmarkHistoryRun, 0, len(paths))
+	maxGPUIndex := -1
 	for _, path := range paths {
 		raw, err := os.ReadFile(path)
 		if err != nil {
@@ -2277,108 +2375,147 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 		run := benchmarkHistoryRun{
 			generatedAt: result.GeneratedAt,
 			displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
-			cells:       make(map[string]benchmarkHistoryCell),
+			gpuScores:   make(map[int]float64),
 		}
-
-		if result.ParallelGPUs {
-			// All GPUs ran simultaneously — one column per server, score = avg composite.
-			gpuModelCount := make(map[string]int)
-			for _, gpu := range result.GPUs {
-				gpuModelCount[strings.TrimSpace(gpu.Name)]++
-			}
-			scoreSum := make(map[string]float64)
-			scoreCnt := make(map[string]int)
-			for _, gpu := range result.GPUs {
-				key := "parallel|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name)
-				scoreSum[key] += gpu.Scores.CompositeScore
-				scoreCnt[key]++
-				count := gpuModelCount[strings.TrimSpace(gpu.Name)]
-				columnByKey[key] = benchmarkHistoryColumn{
-					key:      key,
-					label:    benchmarkHistoryParallelLabel(result.ServerModel, gpu.Name, count),
-					name:     strings.TrimSpace(gpu.Name),
-					index:    -1,
-					parallel: true,
-				}
-			}
-			for key, sum := range scoreSum {
-				run.cells[key] = benchmarkHistoryCell{score: sum / float64(scoreCnt[key]), present: true}
-			}
-		} else {
-			// Each GPU ran independently — one column per GPU index.
-			for _, gpu := range result.GPUs {
-				key := "gpu|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name) + "|" + strconv.Itoa(gpu.Index)
-				columnByKey[key] = benchmarkHistoryColumn{
-					key:      key,
-					label:    benchmarkHistoryPerGPULabel(gpu.Name, gpu.Index),
-					name:     strings.TrimSpace(gpu.Name),
-					index:    gpu.Index,
-					parallel: false,
-				}
-				run.cells[key] = benchmarkHistoryCell{score: gpu.Scores.CompositeScore, present: true}
+		for _, gpu := range result.GPUs {
+			run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
+			if gpu.Index > maxGPUIndex {
+				maxGPUIndex = gpu.Index
 			}
 		}
 		runs = append(runs, run)
 	}
-
-	columns := make([]benchmarkHistoryColumn, 0, len(columnByKey))
-	for _, col := range columnByKey {
-		columns = append(columns, col)
-	}
-	// Sequential GPU columns first (sorted by GPU index), then parallel server columns.
-	sort.Slice(columns, func(i, j int) bool {
-		if columns[i].parallel != columns[j].parallel {
-			return !columns[i].parallel // sequential first
-		}
-		if columns[i].parallel {
-			li := strings.ToLower(columns[i].label)
-			lj := strings.ToLower(columns[j].label)
-			if li != lj {
-				return li < lj
-			}
-			return columns[i].key < columns[j].key
-		}
-		// Sequential: sort by GPU index, then name.
-		if columns[i].index != columns[j].index {
-			return columns[i].index < columns[j].index
-		}
-		return strings.ToLower(columns[i].name) < strings.ToLower(columns[j].name)
-	})
 	sort.Slice(runs, func(i, j int) bool {
 		return runs[i].generatedAt.After(runs[j].generatedAt)
 	})
-	return columns, runs
+	return maxGPUIndex, runs
 }

-// benchmarkHistoryPerGPULabel formats a label for a single-GPU column: "GPU #N — ModelName".
-func benchmarkHistoryPerGPULabel(gpuName string, index int) string {
-	gpuName = strings.TrimSpace(gpuName)
-	if gpuName == "" {
-		gpuName = "Unknown GPU"
+// renderPowerBenchmarkResultsCard renders the "Power / Thermal Fit Results"
+// card from saved power benchmark runs.
+//
+// Runs are read from <exportDir>/bee-bench/power/power-*/result.json, or from
+// app.DefaultBeeBenchPowerDir when exportDir is blank. Result files that cannot
+// be read or decoded are skipped. The card shows a per-GPU table for the newest
+// run and, when more than one run was loaded, a collapsible summary of all
+// runs (newest first). If no run could be loaded, an empty-state card is
+// returned instead.
+func renderPowerBenchmarkResultsCard(exportDir string) string {
+	const emptyCard = `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
+	baseDir := app.DefaultBeeBenchPowerDir
+	if strings.TrimSpace(exportDir) != "" {
+		baseDir = filepath.Join(exportDir, "bee-bench", "power")
 	}
-	return fmt.Sprintf("GPU #%d — %s", index, gpuName)
-}
+	paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
+	if err != nil || len(paths) == 0 {
+		return emptyCard
+	}
+	sort.Strings(paths)

-// benchmarkHistoryParallelLabel formats a label for an all-GPU parallel column:
-// "ServerModel — N× ModelName (All GPUs)" or "N× ModelName (All GPUs)" if no server.
-func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) string {
-	serverModel = strings.TrimSpace(serverModel)
-	gpuName = strings.TrimSpace(gpuName)
-	if gpuName == "" {
-		gpuName = "Unknown GPU"
+	type powerRun struct {
+		generatedAt time.Time
+		displayTime string
+		result      platform.NvidiaPowerBenchResult
 	}
-	gpuPart := fmt.Sprintf("%d× %s (All GPUs)", count, gpuName)
-	if serverModel == "" {
-		return gpuPart
+	var runs []powerRun
+	for _, path := range paths {
+		raw, err := os.ReadFile(path)
+		if err != nil {
+			continue
+		}
+		var r platform.NvidiaPowerBenchResult
+		if err := json.Unmarshal(raw, &r); err != nil {
+			continue
+		}
+		runs = append(runs, powerRun{
+			generatedAt: r.GeneratedAt,
+			displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			result:      r,
+		})
 	}
-	return fmt.Sprintf("%s — %s", serverModel, gpuPart)
+	sort.Slice(runs, func(i, j int) bool {
+		return runs[i].generatedAt.After(runs[j].generatedAt)
+	})
+	if len(runs) == 0 {
+		// Every matched result.json was unreadable or malformed. Without this
+		// guard the runs[0] access below panics with an index out of range.
+		return emptyCard
+	}
+
+	// Show only the most recent run's GPU slot table, plus a run history summary.
+	var b strings.Builder
+	b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
+
+	latest := runs[0].result
+	b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
+	if latest.Hostname != "" {
+		b.WriteString(` — ` + html.EscapeString(latest.Hostname))
+	}
+	if latest.OverallStatus != "" {
+		statusColor := "var(--ok)"
+		if latest.OverallStatus != "OK" {
+			statusColor = "var(--warn)"
+		}
+		b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
+	}
+	b.WriteString(`</p>`)
+
+	if len(latest.GPUs) > 0 {
+		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
+		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
+		b.WriteString(`</tr></thead><tbody>`)
+		for _, gpu := range latest.GPUs {
+			// A GPU is highlighted as derated when the result says so, or when
+			// the applied limit is more than 1 W below the default limit.
+			derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
+			rowStyle := ""
+			achievedStyle := ""
+			if derated {
+				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
+				achievedStyle = ` style="color:#e6a000;font-weight:600"`
+			}
+			// An empty per-GPU status is displayed as OK.
+			statusLabel := gpu.Status
+			if statusLabel == "" {
+				statusLabel = "OK"
+			}
+			statusColor := "var(--ok)"
+			if statusLabel != "OK" {
+				statusColor = "var(--warn)"
+			}
+			// Non-positive wattages are rendered as "-" (no data).
+			nominalStr := "-"
+			if gpu.DefaultPowerLimitW > 0 {
+				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
+			}
+			achievedStr := "-"
+			if gpu.AppliedPowerLimitW > 0 {
+				achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+			}
+			// NOTE(review): the column header says "P95 Observed W" but the
+			// value shown is MaxObservedPowerW — confirm which is intended.
+			p95Str := "-"
+			if gpu.MaxObservedPowerW > 0 {
+				p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
+			}
+			b.WriteString(`<tr` + rowStyle + `>`)
+			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
+			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
+			b.WriteString(`<td>` + nominalStr + `</td>`)
+			b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
+			b.WriteString(`<td>` + p95Str + `</td>`)
+			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
+			b.WriteString(`</tr>`)
+		}
+		b.WriteString(`</tbody></table></div>`)
+	}
+
+	if len(runs) > 1 {
+		b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
+		b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
+		for i, run := range runs {
+			statusColor := "var(--ok)"
+			if run.result.OverallStatus != "OK" {
+				statusColor = "var(--warn)"
+			}
+			b.WriteString(`<tr>`)
+			b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
+			b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+			b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
+			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
+			b.WriteString(`</tr>`)
+		}
+		b.WriteString(`</tbody></table></div></details>`)
+	}
+
+	b.WriteString(`</div></div>`)
+	return b.String()
 }

 // ── Burn ──────────────────────────────────────────────────────────────────────

 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

 <div class="card" style="margin-bottom:16px">
@@ -3338,12 +3475,19 @@ fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
  else if (kind === 'disk') label = 'disk (' + source + ')';
  else label = source;
  boot.textContent = 'Current boot source: ' + label + '.';
-  if (d.in_ram) {
-    txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
+  txt.textContent = d.message || 'Checking...';
+  if (d.status === 'ok' || d.in_ram) {
    txt.style.color = 'var(--ok, green)';
+  } else if (d.status === 'failed') {
+    txt.style.color = 'var(--err, #b91c1c)';
  } else {
-    txt.textContent = 'Live media is mounted from installation device. Copy to RAM to allow media removal.';
+    txt.style.color = 'var(--muted)';
+  }
+  if (d.can_start_task) {
    btn.style.display = '';
+    btn.disabled = false;
+  } else {
+    btn.style.display = 'none';
  }
 });
 function installToRAM() {
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -261,7 +261,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
 	mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
-	mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
+	mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
+	mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
+	mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)

 	// Tasks
 	mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -11,6 +11,7 @@ import (
 	"time"

 	"bee/audit/internal/platform"
+	"bee/audit/internal/schema"
 )

 func TestChartLegendNumber(t *testing.T) {
@@ -78,6 +79,16 @@ func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
 	}
 }

+// TestBuildRuntimeToRAMRowShowsPartialCopyWarning checks that a runtime health
+// record with ToRAMStatus "partial" yields a WARNING row whose issue text
+// mentions the partial or staged RAM copy.
+func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
+	row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
+	if row.Status != "WARNING" {
+		t.Fatalf("status=%q want WARNING", row.Status)
+	}
+	if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
+		t.Fatalf("issue=%q", row.Issue)
+	}
+}
+
 func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 	samples := []platform.LiveMetricSample{
 		{
@@ -637,8 +648,11 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 		`href="/benchmark"`,
 		`id="benchmark-gpu-list"`,
 		`/api/gpu/nvidia`,
-		`/api/benchmark/nvidia/run`,
+		`/api/bee-bench/nvidia/perf/run`,
+		`/api/bee-bench/nvidia/power/run`,
 		`benchmark-run-nccl`,
+		`Run Performance Benchmark`,
+		`Run Power / Thermal Fit`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("benchmark page missing %q: %s", needle, body)
@@ -649,7 +663,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
 	dir := t.TempDir()
 	exportDir := filepath.Join(dir, "export")
-	runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
 	if err := os.MkdirAll(runDir, 0755); err != nil {
 		t.Fatal(err)
 	}
@@ -691,10 +705,10 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
 	body := rec.Body.String()
 	wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
 	for _, needle := range []string{
-		`Benchmark Results`,
+		`Perf Results`,
 		`Composite score by saved benchmark run and GPU.`,
-		`GPU #0 — NVIDIA H100 PCIe`,
-		`GPU #1 — NVIDIA H100 PCIe`,
+		`GPU 0`,
+		`GPU 1`,
 		`#1`,
 		wantTime,
 		`1176.25`,
@@ -730,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	}
 }

+// TestValidatePageRendersNvidiaFabricCardsInValidateMode checks that GET
+// /validate responds 200 and that the page body contains the NCCL and
+// NVBandwidth card titles together with their Validate-mode descriptions.
+func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	for _, needle := range []string{
+		`NVIDIA Interconnect (NCCL)`,
+		`Runs in Validate and Stress.`,
+		`NVIDIA Bandwidth (NVBandwidth)`,
+		`Intended to stay short enough for Validate.`,
+	} {
+		if !strings.Contains(body, needle) {
+			t.Fatalf("validate page missing %q: %s", needle, body)
+		}
+	}
+}
+
 func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
@@ -1113,8 +1147,8 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
 		`>Storage<`,
 		`>GPU<`,
 		`>PSU<`,
-		`badge-warn`,   // cpu Warning badge
-		`badge-err`,    // storage Critical badge
+		`badge-warn`, // cpu Warning badge
+		`badge-err`,  // storage Critical badge
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("dashboard missing %q: %s", needle, body)
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
 	if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
 		b.WriteString(benchmarkCard)
 	}
+	if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
+		b.WriteString(powerCard)
+	}

 	if len(report.Charts) > 0 {
 		for _, chart := range report.Charts {
@@ -251,7 +254,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
 }

 func renderTaskBenchmarkResultsCard(target, logText string) string {
-	if strings.TrimSpace(target) != "nvidia-benchmark" {
+	switch strings.TrimSpace(target) {
+	case "nvidia-bench-perf":
+	default:
 		return ""
 	}
 	resultPath := taskBenchmarkResultPath(logText)
@@ -263,7 +268,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
 		return ""
 	}
 	return renderBenchmarkResultsCardFromRuns(
-		"Benchmark Results",
+		"Perf Results",
 		"Composite score for this benchmark task.",
 		"No benchmark results were saved for this task.",
 		columns,
@@ -271,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
 	)
 }

+// renderTaskPowerResultsCard renders the "Power Results" card for a task report.
+//
+// It returns "" unless target is "nvidia-bench-power" and the result.json
+// located via taskBenchmarkResultPath(logText) can be read and decoded. The
+// card lists the recommended slot order when the result has one, followed by
+// one table row per GPU with its status, max observed power and applied limit.
+func renderTaskPowerResultsCard(target, logText string) string {
+	if strings.TrimSpace(target) != "nvidia-bench-power" {
+		return ""
+	}
+	resultPath := taskBenchmarkResultPath(logText)
+	if strings.TrimSpace(resultPath) == "" {
+		return ""
+	}
+	raw, err := os.ReadFile(resultPath)
+	if err != nil {
+		return ""
+	}
+	var result platform.NvidiaPowerBenchResult
+	if err := json.Unmarshal(raw, &result); err != nil {
+		return ""
+	}
+	var b strings.Builder
+	b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
+	if len(result.RecommendedSlotOrder) > 0 {
+		b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
+	}
+	b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
+	for _, gpu := range result.GPUs {
+		// Status is the only free-text field here, so it is the only one escaped.
+		fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
+			gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
+	}
+	b.WriteString(`</table></div></div>`)
+	return b.String()
+}
+
+// taskBenchmarkResultPath derives the result.json path for a benchmark task
+// from the archive path recorded in its log.
+//
+// It returns "" when taskArchivePathFromLog finds no archive path. Otherwise a
+// trailing ".tar.gz" is stripped to get the run directory and "result.json" is
+// joined onto it; a path without that suffix is used as the run directory
+// unchanged.
 func taskBenchmarkResultPath(logText string) string {
 	archivePath := taskArchivePathFromLog(logText)
 	if archivePath == "" {
 		return ""
 	}
 	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
-	if runDir == archivePath {
-		return ""
-	}
 	return filepath.Join(runDir, "result.json")
 }

--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -32,7 +32,8 @@ const (
 var taskNames = map[string]string{
 	"nvidia":                 "NVIDIA SAT",
 	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
-	"nvidia-benchmark":       "NVIDIA Benchmark",
+	"nvidia-bench-perf":      "NVIDIA Bee Bench Perf",
+	"nvidia-bench-power":     "NVIDIA Bee Bench Power",
 	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
 	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
 	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -161,6 +162,32 @@ type nvidiaRampSpec struct {
 	TotalDurationSec int
 }

+// resolveMemoryValidatePreset picks the memory test size (MB) and pass count.
+//
+// A recognised profile name (matched case-insensitively, surrounding
+// whitespace ignored) takes precedence: "overnight" -> 1024 MB x 2,
+// "acceptance" -> 1024 MB x 1, "smoke" -> 256 MB x 1. For any other profile
+// the stress flag decides: 512 MB x 1 when set, 256 MB x 1 otherwise.
+func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
+	switch strings.TrimSpace(strings.ToLower(profile)) {
+	case "overnight":
+		return 1024, 2
+	case "acceptance":
+		return 1024, 1
+	case "smoke":
+		return 256, 1
+	}
+	if stress {
+		return 512, 1
+	}
+	return 256, 1
+}
+
+// taskMayLeaveOrphanWorkers reports whether target (matched case-insensitively,
+// surrounding whitespace ignored) is one of the task targets listed below.
+// Callers use it to decide whether to call platform.KillTestWorkers after
+// aborting or recovering a task.
+//
+// NOTE(review): "nvidia-bench-perf" is listed but "nvidia-bench-power" is not —
+// confirm whether the power benchmark can also leave workers behind.
+func taskMayLeaveOrphanWorkers(target string) bool {
+	switch strings.TrimSpace(strings.ToLower(target)) {
+	case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
+		"nvidia-bandwidth", "nvidia-stress", "nvidia-compute", "nvidia-bench-perf",
+		"memory", "memory-stress", "cpu", "sat-stress", "platform-stress":
+		return true
+	default:
+		return false
+	}
+}
+
 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
@@ -628,7 +655,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = 300
 		}
 		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
-	case "nvidia-benchmark":
+	case "nvidia-bench-perf":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
@@ -644,6 +671,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			RampTotal:         t.params.RampTotal,
 			RampRunID:         t.params.RampRunID,
 		}, j.append)
+	case "nvidia-bench-power":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
+			Profile:           t.params.BenchmarkProfile,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			RampStep:          t.params.RampStep,
+			RampTotal:         t.params.RampTotal,
+			RampRunID:         t.params.RampRunID,
+		}, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -696,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		dur := t.params.Duration
-		if t.params.BurnProfile != "" && dur <= 0 {
-			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
-		}
-		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
-			DurationSec: dur,
-			Loader:      platform.NvidiaStressLoaderNCCL,
-			GPUIndices:  t.params.GPUIndices,
-		}, j.append)
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -737,10 +769,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		sizeMB, passes := 256, 1
-		if t.params.StressMode {
-			sizeMB, passes = 1024, 3
-		}
+		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
+		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
 		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
 	case "storage":
 		if a == nil {
@@ -996,6 +1026,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
@@ -1023,6 +1056,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
@@ -1127,10 +1163,13 @@ func (q *taskQueue) loadLocked() {
 		q.assignTaskLogPathLocked(t)
 		if t.Status == TaskRunning {
 			// The task was interrupted by a bee-web restart. Child processes
-			// (e.g. bee-gpu-burn-worker) survive the restart in their own
-			// process groups and cannot be cancelled retroactively. Mark the
-			// task as failed so the user can decide whether to re-run it
-			// rather than blindly re-launching duplicate workers.
+			// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
+			// their own process groups. Kill any matching stale workers before
+			// marking the task failed so the next GPU test does not inherit a
+			// busy DCGM slot or duplicate workers.
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				_ = platform.KillTestWorkers()
+			}
 			now := time.Now()
 			t.Status = TaskFailed
 			t.DoneAt = &now
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	taskReportMetricsDBPath = metricsPath
 	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })

-	benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
 	if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
 		t.Fatal(err)
 	}
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	}
 	task := &Task{
 		ID:           "task-bench",
-		Name:         "NVIDIA Benchmark",
-		Target:       "nvidia-benchmark",
+		Name:         "NVIDIA Bee Bench Perf",
+		Target:       "nvidia-bench-perf",
 		Status:       TaskDone,
 		CreatedAt:    time.Now().UTC().Add(-time.Minute),
 		ArtifactsDir: artifactsDir,
 	}
 	ensureTaskReportPaths(task)
-	logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
+	logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
 	if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
 		t.Fatal(err)
 	}
@@ -420,9 +420,9 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	}
 	html := string(body)
 	for _, needle := range []string{
-		`Benchmark Results`,
+		`Perf Results`,
 		`Composite score for this benchmark task.`,
-		`GPU #0 — NVIDIA H100 PCIe`,
+		`GPU 0`,
 		`1176.25`,
 	} {
 		if !strings.Contains(html, needle) {
@@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
 	}
 }

+// TestRunTaskUsesQuickPresetForMemoryValidate checks that runTask, for a
+// "memory" task with StressMode set and no burn profile, calls
+// runMemoryAcceptancePackCtx with 512 MB and 1 pass. The pack runner is
+// stubbed for the duration of the test and restored afterwards.
+func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
+	var gotSizeMB, gotPasses int
+	q := &taskQueue{
+		opts: &HandlerOptions{App: &app.App{}},
+	}
+	tk := &Task{
+		ID:        "mem-validate-1",
+		Name:      "Memory SAT",
+		Target:    "memory",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{StressMode: true},
+	}
+	j := &jobState{}
+
+	// Swap in a stub that records the size/pass arguments instead of running a test.
+	orig := runMemoryAcceptancePackCtx
+	runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
+		gotSizeMB = sizeMB
+		gotPasses = passes
+		return "/tmp/memory-validate.tar.gz", nil
+	}
+	defer func() { runMemoryAcceptancePackCtx = orig }()
+
+	q.runTask(tk, j, context.Background())
+
+	if gotSizeMB != 512 || gotPasses != 1 {
+		t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
+	}
+}
+
 func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
 	dir := t.TempDir()
 	q := &taskQueue{
--- a/bible-local/docs/benchmark-clock-calibration.md
+++ b/bible-local/docs/benchmark-clock-calibration.md
@@ -1,5 +1,34 @@
 # Benchmark clock calibration research

+## Benchmark methodology versioning
+
+Every benchmark methodology change must bump the benchmark version constant in
+source code by exactly `+1`.
+
+Methodology change means any change that affects comparability of benchmark
+results, including for example:
+- phase durations or phase order
+- enabled/disabled precisions
+- fallback rules
+- normalization rules
+- score formulas or weights
+- degradation thresholds
+- power calibration logic
+- thermal/power penalty logic
+
+Requirements:
+- benchmark version must be stored in source code as an explicit version
+  constant, not inferred from git tag or build metadata
+- benchmark report must always print the benchmark version
+- `result.json` must always include the benchmark version
+- results from different benchmark versions must be treated as non-comparable by
+  default
+
+Purpose:
+- prevent accidental comparison of runs produced by different methodologies
+- make historical benchmark archives self-describing even when detached from git
+- force deliberate version bumps whenever scoring or execution semantics change
+
 ## Status
 In progress. Baseline data from production servers pending.

--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -1,12 +1,13 @@
 DEBIAN_VERSION=12
 DEBIAN_KERNEL_ABI=auto
 NVIDIA_DRIVER_VERSION=590.48.01
+NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
 NCCL_VERSION=2.28.9-1
 NCCL_CUDA_VERSION=13.0
 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
 NCCL_TESTS_VERSION=2.13.10
 NVCC_VERSION=12.8
-CUBLAS_VERSION=13.0.2.14-1
+CUBLAS_VERSION=13.1.1.3-1
 CUDA_USERSPACE_VERSION=13.0.96-1
 DCGM_VERSION=4.5.3-1
 JOHN_JUMBO_COMMIT=67fcf9fe5a
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
 COMGR_VERSION=2.8.0.60304-76~22.04
 GO_VERSION=1.24.0
 AUDIT_VERSION=1.0.0
+MEMTEST_VERSION=6.10-4
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -23,9 +23,9 @@ lb config noauto \
    --bootloaders "grub-efi,syslinux" \
    --debian-installer none \
    --archive-areas "main contrib non-free non-free-firmware" \
-    --mirror-bootstrap "https://deb.debian.org/debian" \
-    --mirror-chroot "https://deb.debian.org/debian" \
-    --mirror-binary "https://deb.debian.org/debian" \
+    --mirror-bootstrap "http://mirror.mephi.ru/debian/" \
+    --mirror-chroot "http://mirror.mephi.ru/debian/" \
+    --mirror-binary "http://mirror.mephi.ru/debian/" \
    --security true \
    --linux-flavours "amd64" \
    --linux-packages "${LB_LINUX_PACKAGES}" \
@@ -33,6 +33,7 @@ lb config noauto \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --debootstrap-options "--include=ca-certificates" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -33,9 +33,10 @@ typedef void *CUstream;
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
 #define MAX_STRESS_STREAMS 16
-#define MAX_CUBLAS_PROFILES 5
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
+#define MAX_SINGLE_PRECISION_STREAMS 4
+#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)

 static const char *ptx_source =
    ".version 6.0\n"
@@ -297,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
    return stream_count;
 }

+/* Caps profile_budget_bytes at MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
+ * smaller budgets are returned unchanged. */
+static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
+    if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
+        return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
+    }
+    return profile_budget_bytes;
+}
+
 static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
    if (!api->cuStreamDestroy) {
        return;
@@ -643,6 +651,20 @@ static const struct profile_desc k_profiles[] = {
        CUDA_R_16F,
        CUBLAS_COMPUTE_32F_FAST_16F,
    },
+    {
+        "int8_tensor",
+        "int8",
+        75,
+        1,
+        0,
+        0,
+        128,
+        CUDA_R_8I,
+        CUDA_R_8I,
+        CUDA_R_32I,
+        CUDA_R_32I,
+        CUBLAS_COMPUTE_32I,
+    },
    {
        "fp8_e4m3",
        "fp8",
@@ -689,6 +711,21 @@ static const struct profile_desc k_profiles[] = {
 #endif
 };

+#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
+
+/* Returns 1 if the profile should be included in this run, 0 otherwise.
+ *
+ * A profile must be enabled and cc must be at least desc->min_cc. With a
+ * non-NULL precision_filter only profiles whose block_label equals the filter
+ * are allowed; with a NULL filter every eligible profile except the "fp64"
+ * and "fp4" blocks is allowed. */
+static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
+    if (!(desc->enabled && cc >= desc->min_cc)) {
+        return 0;
+    }
+    if (precision_filter != NULL) {
+        return strcmp(desc->block_label, precision_filter) == 0;
+    }
+    /* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
+     * unstable on the current benchmark fleet and can abort the whole mixed
+     * pass after earlier phases already collected useful telemetry. */
+    return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
+}
+
 static int load_cublaslt(struct cublaslt_api *api) {
    memset(api, 0, sizeof(*api));
    api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -759,10 +796,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
 static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
    switch (type) {
        case CUDA_R_32F:
+        case CUDA_R_32I:
            return (size_t)(elements * 4u);
        case CUDA_R_16F:
        case CUDA_R_16BF:
            return (size_t)(elements * 2u);
+        case CUDA_R_8I:
        case CUDA_R_8F_E4M3:
        case CUDA_R_8F_E5M2:
            return (size_t)(elements);
@@ -775,6 +814,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
    }
 }

+/* Maps a profile's compute type to the scale type given to cublasLtMatmulDescCreate:
+ * int32 and fp64 compute keep their own type, everything else scales in fp32. */
+static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
+    switch (desc->compute_type) {
+        case CUBLAS_COMPUTE_32I: return CUDA_R_32I;
+        case CUBLAS_COMPUTE_64F: return CUDA_R_64F;
+        default: return CUDA_R_32F;
+    }
+}
+
 static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
    uint64_t row_tiles = (rows + 127u) / 128u;
    uint64_t col_tiles = (cols + 63u) / 64u;
@@ -881,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
                           CUstream stream,
                           size_t profile_budget_bytes,
                           struct prepared_profile *out) {
-    memset(out, 0, sizeof(*out));
-    out->desc = *desc;
-    out->stream = stream;
-
    size_t bytes_per_cell = 0;
+    size_t attempt_budget = profile_budget_bytes;
+
    bytes_per_cell += bytes_for_elements(desc->a_type, 1);
    bytes_per_cell += bytes_for_elements(desc->b_type, 1);
    bytes_per_cell += bytes_for_elements(desc->c_type, 1);
@@ -894,105 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
        return 0;
    }

-    uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
-    out->m = dim;
-    out->n = dim;
-    out->k = dim;
+    while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
+        memset(out, 0, sizeof(*out));
+        out->desc = *desc;
+        out->stream = stream;

-    size_t desired_workspace = profile_budget_bytes / 8u;
-    if (desired_workspace > 32u * 1024u * 1024u) {
-        desired_workspace = 32u * 1024u * 1024u;
-    }
-    desired_workspace = round_down_size(desired_workspace, 256u);
+        uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
+        out->m = dim;
+        out->n = dim;
+        out->k = dim;

-    size_t a_bytes = 0;
-    size_t b_bytes = 0;
-    size_t c_bytes = 0;
-    size_t d_bytes = 0;
-    size_t scale_bytes = 0;
-    while (1) {
-        a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
-        b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
-        c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
-        d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
-        scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+        size_t desired_workspace = attempt_budget / 8u;
+        if (desired_workspace > 32u * 1024u * 1024u) {
+            desired_workspace = 32u * 1024u * 1024u;
+        }
+        desired_workspace = round_down_size(desired_workspace, 256u);

-        size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
-        if (matrix_bytes <= profile_budget_bytes) {
-            size_t remaining = profile_budget_bytes - matrix_bytes;
-            out->workspace_size = desired_workspace;
-            if (out->workspace_size > remaining) {
-                out->workspace_size = round_down_size(remaining, 256u);
+        size_t a_bytes = 0;
+        size_t b_bytes = 0;
+        size_t c_bytes = 0;
+        size_t d_bytes = 0;
+        size_t scale_bytes = 0;
+        while (1) {
+            a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
+            b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
+            c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
+            d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
+            scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+
+            size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
+            if (matrix_bytes <= attempt_budget) {
+                size_t remaining = attempt_budget - matrix_bytes;
+                out->workspace_size = desired_workspace;
+                if (out->workspace_size > remaining) {
+                    out->workspace_size = round_down_size(remaining, 256u);
+                }
+                break;
            }
-            break;
+
+            if (out->m <= (uint64_t)desc->min_multiple) {
+                break;
+            }
+            out->m -= (uint64_t)desc->min_multiple;
+            out->n = out->m;
+            out->k = out->m;
+        }
+        if (out->m < (uint64_t)desc->min_multiple) {
+            attempt_budget /= 2u;
+            continue;
        }

-        if (out->m <= (uint64_t)desc->min_multiple) {
-            return 0;
-        }
-        out->m -= (uint64_t)desc->min_multiple;
-        out->n = out->m;
-        out->k = out->m;
-    }
-
-    if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
-        !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (!check_cublas("cublasLtMatmulDescCreate",
-                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cublasOperation_t transa = CUBLAS_OP_T;
-    cublasOperation_t transb = CUBLAS_OP_N;
-    if (!check_cublas("set TRANSA",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSA,
-                                                             &transa,
-                                                             sizeof(transa))) ||
-        !check_cublas("set TRANSB",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSB,
-                                                             &transb,
-                                                             sizeof(transb)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (desc->needs_scalar_scale) {
-        float one = 1.0f;
-        if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
-            !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+        if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
+            !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
-            !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+
+        cudaDataType_t scale_type = matmul_scale_type(desc);
+        if (!check_cublas("cublasLtMatmulDescCreate",
+                          cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
-        void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
-        if (!check_cublas("set A scale ptr",
+
+        cublasOperation_t transa = CUBLAS_OP_T;
+        cublasOperation_t transb = CUBLAS_OP_N;
+        if (!check_cublas("set TRANSA",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
-                                                                 &a_scale_ptr,
-                                                                 sizeof(a_scale_ptr))) ||
-            !check_cublas("set B scale ptr",
+                                                                 CUBLASLT_MATMUL_DESC_TRANSA,
+                                                                 &transa,
+                                                                 sizeof(transa))) ||
+            !check_cublas("set TRANSB",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
-                                                                 &b_scale_ptr,
-                                                                 sizeof(b_scale_ptr)))) {
+                                                                 CUBLASLT_MATMUL_DESC_TRANSB,
+                                                                 &transb,
+                                                                 sizeof(transb)))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-    }
+
+        if (desc->needs_scalar_scale) {
+            float one = 1.0f;
+            if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
+                !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
+                !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
+            void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
+            if (!check_cublas("set A scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                                     &a_scale_ptr,
+                                                                     sizeof(a_scale_ptr))) ||
+                !check_cublas("set B scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                                     &b_scale_ptr,
+                                                                     sizeof(b_scale_ptr)))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }

 #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
    if (desc->needs_block_scale) {
@@ -1032,78 +1089,94 @@ static int prepare_profile(struct cublaslt_api *cublas,
    }
 #endif

-    if (!check_cublas("create A layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
-        !check_cublas("create B layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
-        !check_cublas("create C layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
-        !check_cublas("create D layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (out->workspace_size > 0) {
-        if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+        if (!check_cublas("create A layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
+            !check_cublas("create B layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
+            !check_cublas("create C layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
+            !check_cublas("create D layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
+
+        if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        if (out->workspace_size > 0) {
+            if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }
+
+        if (!check_cublas("set workspace",
+                          cublas->cublasLtMatmulPreferenceSetAttribute(
+                              out->preference,
+                              CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                              &out->workspace_size,
+                              sizeof(out->workspace_size)))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        int found = 0;
+        if (check_cublas("heuristic",
+                         cublas->cublasLtMatmulAlgoGetHeuristic(handle,
+                                                                out->op_desc,
+                                                                out->a_layout,
+                                                                out->b_layout,
+                                                                out->c_layout,
+                                                                out->d_layout,
+                                                                out->preference,
+                                                                1,
+                                                                &out->heuristic,
+                                                                &found)) &&
+            found > 0) {
+            out->ready = 1;
+            return 1;
+        }
+
+        destroy_profile(cublas, cuda, out);
+        attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
+        if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
+            break;
+        }
    }

-    if (!check_cublas("set workspace",
-                      cublas->cublasLtMatmulPreferenceSetAttribute(
-                          out->preference,
-                          CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-                          &out->workspace_size,
-                          sizeof(out->workspace_size)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    int found = 0;
-    if (!check_cublas("heuristic",
-                      cublas->cublasLtMatmulAlgoGetHeuristic(handle,
-                                                             out->op_desc,
-                                                             out->a_layout,
-                                                             out->b_layout,
-                                                             out->c_layout,
-                                                             out->d_layout,
-                                                             out->preference,
-                                                             1,
-                                                             &out->heuristic,
-                                                             &found))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-    if (found <= 0) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    out->ready = 1;
-    return 1;
+    return 0;
 }

 static int run_cublas_profile(cublasLtHandle_t handle,
                              struct cublaslt_api *cublas,
                              struct prepared_profile *profile) {
+    int32_t alpha_i32 = 1;
+    int32_t beta_i32 = 0;
+    double alpha_f64 = 1.0;
+    double beta_f64 = 0.0;
    float alpha = 1.0f;
    float beta = 0.0f;
+    const void *alpha_ptr = &alpha;
+    const void *beta_ptr = &beta;
+    if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
+        alpha_ptr = &alpha_i32;
+        beta_ptr = &beta_i32;
+    } else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
+        alpha_ptr = &alpha_f64;
+        beta_ptr = &beta_f64;
+    }
    return check_cublas(profile->desc.name,
                        cublas->cublasLtMatmul(handle,
                                               profile->op_desc,
-                                               &alpha,
+                                               alpha_ptr,
                                               (const void *)(uintptr_t)profile->a_dev,
                                               profile->a_layout,
                                               (const void *)(uintptr_t)profile->b_dev,
                                               profile->b_layout,
-                                               &beta,
+                                               beta_ptr,
                                               (const void *)(uintptr_t)profile->c_dev,
                                               profile->c_layout,
                                               (void *)(uintptr_t)profile->d_dev,
@@ -1121,9 +1194,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                               int cc_minor,
                               int seconds,
                               int size_mb,
+                               const char *precision_filter,
                               struct stress_report *report) {
    struct cublaslt_api cublas;
-    struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
+    struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
    cublasLtHandle_t handle = NULL;
    CUcontext ctx = NULL;
    CUstream streams[MAX_STRESS_STREAMS] = {0};
@@ -1133,11 +1207,12 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    int active = 0;
    int mp_count = 0;
    int stream_count = 1;
-    int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
+    int profile_count = PROFILE_COUNT;
    int prepared_count = 0;
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
+    int budget_profiles = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1158,8 +1233,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

+    /* Count profiles matching the filter (for deciding what to run). */
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned++;
        }
    }
@@ -1170,18 +1246,42 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

+    /* Count all profiles active on this GPU regardless of filter.
+     * Mixed phases still divide budget across the full precision set, while
+     * single-precision benchmark phases dedicate budget only to active
+     * profiles matching precision_filter. */
+    int planned_total = 0;
+    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
+            planned_total++;
+        }
+    }
+    if (planned_total < planned) {
+        planned_total = planned;
+    }
+    budget_profiles = planned_total;
+    if (precision_filter != NULL) {
+        budget_profiles = planned;
+    }
+    if (budget_profiles <= 0) {
+        budget_profiles = planned_total;
+    }
+
    requested_budget = (size_t)size_mb * 1024u * 1024u;
-    if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
-        requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
+    if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
-    if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
-        total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
+    if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    if (query_multiprocessor_count(cuda, dev, &mp_count) &&
        cuda->cuStreamCreate &&
        cuda->cuStreamDestroy) {
-        stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
+        stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
+    }
+    if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
+        stream_count = MAX_SINGLE_PRECISION_STREAMS;
    }
    if (stream_count > 1) {
        int created = 0;
@@ -1194,18 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        }
    }
    report->stream_count = stream_count;
-    per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
+    per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
    }
+    if (precision_filter != NULL) {
+        per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
+    }
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
                  mp_count,
+                  budget_profiles,
                  per_profile_budget / (1024u * 1024u));

    for (int i = 0; i < profile_count; i++) {
@@ -1218,6 +1322,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
+        if (!profile_allowed_for_run(desc, cc, precision_filter)) {
+            append_detail(report->details,
+                          sizeof(report->details),
+                          "%s=SKIPPED benchmark_disabled\n",
+                          desc->name);
+            continue;
+        }
        for (int lane = 0; lane < stream_count; lane++) {
            CUstream stream = streams[lane];
            if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
@@ -1335,10 +1446,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
 }
 #endif

+/* Writes one run's report to stdout as key=value lines; always ends with status=OK. */
+static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
+    FILE *out = stdout;
+    fprintf(out, "device=%s\n", report->device);
+    fprintf(out, "device_index=%d\n", device_index);
+    fprintf(out, "compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
+    fprintf(out, "backend=%s\n", report->backend);
+    fprintf(out, "duration_s=%d\n", seconds);
+    fprintf(out, "buffer_mb=%d\n", report->buffer_mb);
+    fprintf(out, "streams=%d\n", report->stream_count);
+    fprintf(out, "iterations=%lu\n", report->iterations);
+    fprintf(out, "checksum=%llu\n", (unsigned long long)report->checksum);
+    fputs(report->details, out); /* writes nothing when details is empty */
+    fprintf(out, "status=OK\n");
+}
+
 int main(int argc, char **argv) {
    int seconds = 5;
    int size_mb = 64;
    int device_index = 0;
+    const char *precision_filter = NULL; /* NULL = all; else block_label to match */
+    const char *precision_plan = NULL;
+    const char *precision_plan_seconds = NULL;
    for (int i = 1; i < argc; i++) {
        if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
            seconds = atoi(argv[++i]);
@@ -1346,8 +1476,16 @@ int main(int argc, char **argv) {
            size_mb = atoi(argv[++i]);
        } else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
            device_index = atoi(argv[++i]);
+        } else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
+            precision_filter = argv[++i];
+        } else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
+            precision_plan = argv[++i];
+        } else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
+            precision_plan_seconds = argv[++i];
        } else {
-            fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
+            fprintf(stderr,
+                    "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
+                    argv[0]);
            return 2;
        }
    }
@@ -1407,26 +1545,94 @@ int main(int argc, char **argv) {
    int ok = 0;

 #if HAVE_CUBLASLT_HEADERS
-    ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
+    if (precision_plan != NULL && precision_plan[0] != '\0') {
+        char *plan_copy = strdup(precision_plan);
+        char *plan_seconds_copy = NULL;
+        int phase_seconds[32] = {0};
+        int phase_seconds_count = 0;
+        int phase_ok = 0;
+        if (plan_copy == NULL) {
+            fprintf(stderr, "failed to allocate precision plan buffer\n");
+            return 1;
+        }
+        if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
+            plan_seconds_copy = strdup(precision_plan_seconds);
+            if (plan_seconds_copy == NULL) {
+                free(plan_copy);
+                fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
+                return 1;
+            }
+            for (char *sec_token = strtok(plan_seconds_copy, ",");
+                 sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
+                 sec_token = strtok(NULL, ",")) {
+                while (*sec_token == ' ' || *sec_token == '\t') {
+                    sec_token++;
+                }
+                if (*sec_token == '\0') {
+                    continue;
+                }
+                phase_seconds[phase_seconds_count++] = atoi(sec_token);
+            }
+        }
+        int phase_idx = 0;
+        for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
+            while (*token == ' ' || *token == '\t') {
+                token++;
+            }
+            if (*token == '\0') {
+                continue;
+            }
+            const char *phase_name = token;
+            const char *phase_filter = token;
+            if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
+                phase_filter = NULL;
+            }
+            int phase_duration = seconds;
+            if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
+                phase_duration = phase_seconds[phase_idx];
+            }
+            printf("phase_begin=%s\n", phase_name);
+            fflush(stdout);
+            memset(&report, 0, sizeof(report));
+            ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
+            if (ok) {
+                print_stress_report(&report, device_index, phase_duration);
+                phase_ok = 1;
+            } else {
+                printf("phase_error=%s\n", phase_name);
+                if (report.details[0] != '\0') {
+                    printf("%s", report.details);
+                    if (report.details[strlen(report.details) - 1] != '\n') {
+                        printf("\n");
+                    }
+                }
+                printf("status=FAILED\n");
+            }
+            printf("phase_end=%s\n", phase_name);
+            fflush(stdout);
+        }
+        free(plan_seconds_copy);
+        free(plan_copy);
+        return phase_ok ? 0 : 1;
+    }
+    ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
 #endif
    if (!ok) {
-        if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
+        if (precision_filter != NULL) {
+            fprintf(stderr,
+                    "requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
+                    precision_filter,
+                    name,
+                    cc_major,
+                    cc_minor);
+            return 1;
+        }
+        int ptx_mb = size_mb;
+        if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
            return 1;
        }
    }

-    printf("device=%s\n", report.device);
-    printf("device_index=%d\n", device_index);
-    printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
-    printf("backend=%s\n", report.backend);
-    printf("duration_s=%d\n", seconds);
-    printf("buffer_mb=%d\n", report.buffer_mb);
-    printf("streams=%d\n", report.stream_count);
-    printf("iterations=%lu\n", report.iterations);
-    printf("checksum=%llu\n", (unsigned long long)report.checksum);
-    if (report.details[0] != '\0') {
-        printf("%s", report.details);
-    }
-    printf("status=OK\n");
+    print_stress_report(&report, device_index, seconds);
    return 0;
 }
--- a/iso/builder/build-in-container.sh
+++ b/iso/builder/build-in-container.sh
@@ -161,6 +161,7 @@ run_variant() {
            -e GOMODCACHE=/cache/go-mod \
            -e TMPDIR=/cache/tmp \
            -e BEE_CACHE_DIR=/cache/bee \
+            -e BEE_REQUIRE_MEMTEST=1 \
            -w /work \
            "${IMAGE_REF}" \
            sh /work/iso/builder/build.sh --variant "${_v}" \
@@ -175,6 +176,7 @@ run_variant() {
            -e GOMODCACHE=/cache/go-mod \
            -e TMPDIR=/cache/tmp \
            -e BEE_CACHE_DIR=/cache/bee \
+            -e BEE_REQUIRE_MEMTEST=1 \
            -w /work \
            "${IMAGE_REF}" \
            sh /work/iso/builder/build.sh --variant "${_v}"
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -57,6 +57,7 @@ OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
 export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT

 . "${BUILDER_DIR}/VERSIONS"
+export MEMTEST_VERSION
 export PATH="$PATH:/usr/local/go/bin"
 : "${BEE_REQUIRE_MEMTEST:=0}"

@@ -775,6 +776,7 @@ run_optional_step_sh() {
        return 0
    fi

+    mkdir -p "${LOG_DIR}" 2>/dev/null || true
    step_log="${LOG_DIR}/${step_slug}.log"
    echo ""
    echo "=== optional step: ${step_name} ==="
@@ -798,13 +800,14 @@ start_build_log
 # install them on the fly so NVIDIA modules and ISO kernel always match.
 if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
    echo "=== refreshing apt index to detect current kernel ABI ==="
-    apt-get update -qq
+    apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
    DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
        | awk '/Depends:.*linux-image-[0-9]/{print $2}' \
        | grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
        | head -1)
    if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
        echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
+        echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
        exit 1
    fi
    echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
@@ -873,9 +876,37 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then

    CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"

+    echo "=== bee-gpu-burn FP4 header probe ==="
+    fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
+    fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
+    if [ -n "$fp4_type_match" ]; then
+        echo "fp4_header_symbol=present"
+        echo "$fp4_type_match"
+    else
+        echo "fp4_header_symbol=missing"
+    fi
+    if [ -n "$fp4_scale_match" ]; then
+        echo "fp4_scale_mode_symbol=present"
+        echo "$fp4_scale_match"
+    else
+        echo "fp4_scale_mode_symbol=missing"
+    fi
+
    GPU_STRESS_NEED_BUILD=1
-    if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
+    if [ -f "$GPU_BURN_WORKER_BIN" ]; then
        GPU_STRESS_NEED_BUILD=0
+        for dep in \
+            "${BUILDER_DIR}/bee-gpu-stress.c" \
+            "${BUILDER_DIR}/VERSIONS"; do
+            if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
+                GPU_STRESS_NEED_BUILD=1
+                break
+            fi
+        done
+        if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
+            find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
+            GPU_STRESS_NEED_BUILD=1
+        fi
    fi

    if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
@@ -889,6 +920,12 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    else
        echo "=== bee-gpu-burn worker up to date, skipping build ==="
    fi
+    echo "=== bee-gpu-burn compiled profile probe ==="
+    if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
+        echo "fp4_profile_string=present"
+    else
+        echo "fp4_profile_string=missing"
+    fi
 fi

 echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
@@ -1225,6 +1262,7 @@ fi
 # --- substitute version placeholders in package list and archive ---
 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    sed -i \
+        -e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
        -e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
        "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
 elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
@@ -1267,7 +1305,7 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
 export BEE_GPU_VENDOR_UPPER

 cd "${LB_DIR}"
-run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
+run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
 run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
 dump_memtest_debug "pre-build" "${LB_DIR}"
 run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -43,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
 # Enable GPU-vendor specific services
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    systemctl enable nvidia-dcgm.service 2>/dev/null || true
+    systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
    systemctl enable bee-nvidia.service
 elif [ "$GPU_VENDOR" = "amd" ]; then
    # ROCm symlinks (packages install to /opt/rocm-*/bin/)
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -1,117 +0,0 @@
-#!/bin/sh
-# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
-set -e
-echo "=== generating bee wallpaper ==="
-mkdir -p /usr/share/bee
-
-python3 - <<'PYEOF'
-from PIL import Image, ImageDraw, ImageFont, ImageFilter
-import os
-
-W, H = 1920, 1080
-
-ASCII_ART = [
-    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
-    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
-    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
-    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
-    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
-    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
-]
-SUBTITLE = "  Hardware Audit LiveCD"
-
-FG = (0xF6, 0xD0, 0x47)
-FG_DIM = (0xD4, 0xA9, 0x1C)
-SHADOW = (0x5E, 0x47, 0x05)
-SUB = (0x96, 0x7A, 0x17)
-BG = (0x05, 0x05, 0x05)
-
-MONO_FONT_CANDIDATES = [
-    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
-    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
-]
-SUB_FONT_CANDIDATES = [
-    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
-    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
-]
-
-
-def load_font(candidates, size):
-    for path in candidates:
-        if os.path.exists(path):
-            return ImageFont.truetype(path, size)
-    return ImageFont.load_default()
-
-
-def mono_metrics(font):
-    probe = Image.new('L', (W, H), 0)
-    draw = ImageDraw.Draw(probe)
-    char_w = int(round(draw.textlength("M", font=font)))
-    bb = draw.textbbox((0, 0), "Mg", font=font)
-    char_h = bb[3] - bb[1]
-    return char_w, char_h
-
-
-def render_ascii_mask(font, lines, char_w, char_h, line_gap):
-    width = max(len(line) for line in lines) * char_w
-    height = len(lines) * char_h + line_gap * (len(lines) - 1)
-    mask = Image.new('L', (width, height), 0)
-    draw = ImageDraw.Draw(mask)
-    for row, line in enumerate(lines):
-        y = row * (char_h + line_gap)
-        for col, ch in enumerate(line):
-            if ch == ' ':
-                continue
-            x = col * char_w
-            draw.text((x, y), ch, font=font, fill=255)
-    return mask
-
-
-img = Image.new('RGB', (W, H), BG)
-draw = ImageDraw.Draw(img)
-
-# Soft amber glow under the logo without depending on font rendering.
-glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
-glow_draw = ImageDraw.Draw(glow)
-glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
-glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
-glow = glow.filter(ImageFilter.GaussianBlur(60))
-img = Image.alpha_composite(img.convert('RGBA'), glow)
-
-TARGET_LOGO_W = 400
-max_chars = max(len(line) for line in ASCII_ART)
-_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
-_probe_cw, _ = mono_metrics(_probe_font)
-font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
-font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
-char_w, char_h = mono_metrics(font_logo)
-logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
-logo_w, logo_h = logo_mask.size
-logo_x = (W - logo_w) // 2
-logo_y = 380
-
-sh_off = max(1, font_size_logo // 6)
-shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
-img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
-img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
-img.paste(FG, (logo_x, logo_y), logo_mask)
-
-font_sub = load_font(SUB_FONT_CANDIDATES, 30)
-sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
-sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
-sub_y = logo_y + logo_h + 48
-draw = ImageDraw.Draw(img)
-draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
-draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
-
-img = img.convert('RGB')
-
-img.save('/usr/share/bee/wallpaper.png', optimize=True)
-print('wallpaper written: /usr/share/bee/wallpaper.png')
-PYEOF
-
-echo "=== wallpaper done ==="
--- a/iso/builder/config/hooks/normal/9100-memtest.hook.binary
+++ b/iso/builder/config/hooks/normal/9100-memtest.hook.binary
@@ -5,6 +5,8 @@ set -e

 : "${BEE_REQUIRE_MEMTEST:=0}"

+# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
+# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
 MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
 BINARY_BOOT_DIR="binary/boot"
 GRUB_CFG="binary/boot/grub/grub.cfg"
@@ -24,15 +26,23 @@ fail_or_warn() {
    return 0
 }

+# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
+# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
+# The template already has memtest entries hardcoded, so a missing config file
+# here is not an error; validate_iso_memtest() checks the final ISO instead.
+warn_only() {
+    log "WARNING: $1"
+}
+
 copy_memtest_file() {
    src="$1"
-    base="$(basename "$src")"
-    dst="${BINARY_BOOT_DIR}/${base}"
+    dst_name="${2:-$(basename "$src")}"
+    dst="${BINARY_BOOT_DIR}/${dst_name}"

    [ -f "$src" ] || return 1
    mkdir -p "${BINARY_BOOT_DIR}"
    cp "$src" "$dst"
-    log "copied ${base} from ${src}"
+    log "copied ${dst_name} from ${src}"
 }

 extract_memtest_from_deb() {
@@ -41,14 +51,44 @@ extract_memtest_from_deb() {

    log "extracting memtest payload from ${deb}"
    dpkg-deb -x "$deb" "$tmpdir"
-    for f in ${MEMTEST_FILES}; do
-        if [ -f "${tmpdir}/boot/${f}" ]; then
-            copy_memtest_file "${tmpdir}/boot/${f}"
-        fi
-    done
+
+    # EFI binary: both 5.x and 6.x use memtest86+x64.efi
+    if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
+        copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
+    fi
+
+    # BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
+    if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
+        copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
+    elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
+        copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
+    fi
+
    rm -rf "$tmpdir"
 }

+download_and_extract_memtest() {
+    tmpdl="$(mktemp -d)"
+    if [ -n "${MEMTEST_VERSION:-}" ]; then
+        pkg_spec="memtest86+=${MEMTEST_VERSION}"
+    else
+        pkg_spec="memtest86+"
+    fi
+    log "downloading ${pkg_spec} from apt"
+    if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
+        log "apt download failed, retrying after apt-get update"
+        apt-get update -qq >/dev/null 2>&1 || true
+        ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
+    fi
+    deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
+    if [ -n "$deb" ]; then
+        extract_memtest_from_deb "$deb"
+    else
+        log "apt download of memtest86+ failed"
+    fi
+    rm -rf "$tmpdl"
+}
+
 ensure_memtest_binaries() {
    missing=0
    for f in ${MEMTEST_FILES}; do
@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
    done
    [ "$missing" -eq 1 ] || return 0

+    # 1. Try files already placed by lb binary_memtest or chroot
    for root in chroot/boot /boot; do
        for f in ${MEMTEST_FILES}; do
            [ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
        done
+        # 6.x BIOS binary may lack x64 in name — copy with normalised name
+        if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
+            copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
+        fi
    done

    missing=0
@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
    done
    [ "$missing" -eq 1 ] || return 0

+    # 2. Try apt package cache (may be empty if lb binary_memtest already purged)
    for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
        [ -d "$root" ] || continue
        deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
        break
    done

+    missing=0
+    for f in ${MEMTEST_FILES}; do
+        [ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
+    done
+    [ "$missing" -eq 1 ] || return 0
+
+    # 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
+    download_and_extract_memtest
+
    missing=0
    for f in ${MEMTEST_FILES}; do
        if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
@@ -88,7 +143,7 @@ ensure_memtest_binaries() {

 ensure_grub_entry() {
    [ -f "$GRUB_CFG" ] || {
-        fail_or_warn "missing ${GRUB_CFG}"
+        warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
        return 0
    }

@@ -114,7 +169,7 @@ EOF

 ensure_isolinux_entry() {
    [ -f "$ISOLINUX_CFG" ] || {
-        fail_or_warn "missing ${ISOLINUX_CFG}"
+        warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
        return 0
    }

--- a/iso/builder/config/package-lists/bee-nvidia.list.chroot
+++ b/iso/builder/config/package-lists/bee-nvidia.list.chroot
@@ -5,6 +5,7 @@
 # DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
 # CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
 # explicitly.
+nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
 datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
--- a/iso/overlay/usr/local/bin/bee-gpu-burn
+++ b/iso/overlay/usr/local/bin/bee-gpu-burn
@@ -6,10 +6,13 @@ STAGGER_SECONDS=0
 SIZE_MB=0
 DEVICES=""
 EXCLUDE=""
+PRECISION=""
+PRECISION_PLAN=""
+PRECISION_PLAN_SECONDS=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"

 usage() {
-    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
+    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
    exit 2
 }

@@ -30,6 +33,9 @@ while [ "$#" -gt 0 ]; do
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
+        --precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
+        --precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
+        --precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
        *) usage ;;
    esac
 done
@@ -88,8 +94,14 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( SECONDS + extra_sec ))
    echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
+    precision_arg=""
+    [ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
+    precision_plan_arg=""
+    [ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
+    precision_plan_seconds_arg=""
+    [ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
    CUDA_VISIBLE_DEVICES="${id}" \
-        "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
+        "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -258,6 +258,22 @@ else
    log "WARN: nvidia-smi not found — cannot enable persistence mode"
 fi

+# Start or refresh Fabric Manager after the NVIDIA stack is ready. On NVSwitch
+# systems CUDA/DCGM can report "system not yet initialized" until fabric
+# training completes under nvidia-fabricmanager.
+if command -v systemctl >/dev/null 2>&1 && systemctl list-unit-files --no-legend 2>/dev/null | grep -q '^nvidia-fabricmanager\.service'; then
+    if systemctl restart nvidia-fabricmanager.service >/dev/null 2>&1; then
+        log "nvidia-fabricmanager restarted"
+    elif systemctl start nvidia-fabricmanager.service >/dev/null 2>&1; then
+        log "nvidia-fabricmanager started"
+    else
+        log "WARN: failed to start nvidia-fabricmanager.service"
+        systemctl status nvidia-fabricmanager.service --no-pager 2>&1 | sed 's/^/  fabricmanager: /' || true
+    fi
+else
+    log "WARN: nvidia-fabricmanager.service not installed"
+fi
+
 # Start DCGM host engine so dcgmi can discover GPUs.
 # nv-hostengine must run after the NVIDIA modules and device nodes are ready.
 # If it started too early (for example via systemd before bee-nvidia-load), it can
--- a/iso/overlay/usr/local/bin/bee-openbox-session
+++ b/iso/overlay/usr/local/bin/bee-openbox-session
@@ -9,9 +9,9 @@ xset s noblank

 # Set desktop background.
 if [ -f /usr/share/bee/wallpaper.png ]; then
-    feh --bg-fill /usr/share/bee/wallpaper.png
+    feh --bg-center --image-bg '#000000' /usr/share/bee/wallpaper.png
 else
-    xsetroot -solid '#f6c90e'
+    xsetroot -solid '#000000'
 fi

 tint2 &
--- a/iso/overlay/usr/share/bee/wallpaper.png
+++ b/iso/overlay/usr/share/bee/wallpaper.png
Author	SHA1	Message	Date
Mikhail Chusavitin	dca4afb8d0	Seed power ramp with single-card TDP limits	2026-04-16 11:43:01 +03:00
Mikhail Chusavitin	b4280941f5	Move NCCL and NVBandwidth into validate mode	2026-04-16 11:02:30 +03:00
Mikhail Chusavitin	f74976ec4c	Use static overlay wallpaper in ISO build	2026-04-16 10:54:03 +03:00
Mikhail Chusavitin	18e24a9aa5	Estimate fan duty from observed RPM maxima	2026-04-16 10:10:18 +03:00
Mikhail Chusavitin	e306250da7	Disable fp64/fp4 in mixed gpu burn	2026-04-16 10:00:03 +03:00
Mikhail Chusavitin	c5b2081ac9	Disable unstable fp4/fp64 benchmark phases	2026-04-16 09:58:02 +03:00
Michael Chus	434528083e	Power bench: compare GPU-reported TDP vs IPMI server power delta - NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower - RunNvidiaPowerBench samples IPMI idle before Phase 1 and loaded via background goroutine throughout Phase 2 ramp - renderPowerBenchReport: new "Server vs GPU Power Comparison" table with ratio annotation (✓ match / ⚠ minor / ✗ over-report) - renderPowerBenchSummary: server_idle_w, server_loaded_w, server_delta_w, server_reporting_ratio keys Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 07:21:02 +03:00
Michael Chus	30aa30cd67	LiveCD: set Baby Bee wallpaper centered on black background 400×400px PNG centered via feh --bg-center --image-bg '#000000'. Fallback solid fill also changed to black. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:57:23 +03:00
Michael Chus	4f76e1de21	Dashboard: per-device status chips with hover tooltips Replace single aggregated badge per hardware category with individual colored chips (O/W/F/?) for each ComponentStatusRecord. Added helper functions: matchedRecords, firstNonEmpty. CSS classes: chip-ok/warn/fail/unknown. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:54:13 +03:00
Michael Chus	3732e64a4a	Add slowdown temperature exceedance detector to benchmark detectSlowdownTempExceedance scans steady-state metric rows per GPU and emits a [WARNING] note + PARTIAL status if any sample >= SlowdownTempC. Uses per-GPU threshold from nvidia-smi -q, fallback 80°C. Distinct from p95-based TempHeadroomC check: catches even a single spike above the slowdown threshold that would be smoothed out in aggregates. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:46:45 +03:00
Michael Chus	0d925299ff	Use per-GPU temperature limits from nvidia-smi -q for headroom calculation Parse "GPU Shutdown Temp" and "GPU Slowdown Temp" from nvidia-smi -q verbose output in enrichGPUInfoWithMaxClocks. Store as ShutdownTempC/SlowdownTempC on benchmarkGPUInfo and BenchmarkGPUResult. Fallback: 90°C shutdown / 80°C slowdown when not available. TempHeadroomC = ShutdownTempC - P95TempC (per-GPU, not hardcoded 100°C). Warning threshold: p95 >= SlowdownTempC. Critical: headroom < 10°C. Report table shows both limits alongside headroom and p95 temp. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:45:15 +03:00
Michael Chus	a8d5e019a5	Translate report to English; add power anomaly detector All report strings are now English only. Add detectPowerAnomaly: scans steady-state metric rows per GPU with a 5-sample rolling baseline; flags a sudden drop ≥30% while GPU usage >50% as [HARD STOP] — indicates bad cable contact or VRM fault. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:42:00 +03:00
Michael Chus	72ec086568	Restructure benchmark report as balanced scorecard (5 perspectives) Split throttle into separate signals: ThermalThrottlePct, PowerCapThrottlePct, SyncBoostThrottlePct. Add TempHeadroomC (100 - p95_temp) as independent thermal headroom metric; warning < 20°C (>80°C), critical < 10°C (>90°C). Hard stop findings: thermal throttle with fans < 95%, ECC uncorrected errors, p95 temp > 90°C. Throttle findings now include per-type percentages and diagnostic context. Replace flat scorecard table with BSC 5-perspective layout: 1. Compatibility (hard stops: thermal+fan, ECC) 2. Thermal headroom (p95 temp, delta to 100°C, throttle %) 3. Power delivery (power cap throttle, power CV, fan duty) 4. Performance (Compute TOPS, Synthetic, Mixed, TOPS/SM/GHz) 5. Anomalies (ECC corrected, sync boost, power/thermal variance) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:40:06 +03:00
Michael Chus	7a0b0934df	Separate compute score from server quality score CompositeScore = raw ComputeScore (TOPS). Throttling GPUs score lower automatically — no quality multiplier distorting the compute signal. Add ServerQualityScore (0-100): server infrastructure quality independent of GPU model. Formula: 0.40×Stability + 0.30×PowerSustain + 0.30×Thermal. Use to compare servers with the same GPU or flag bad server conditions. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:45:55 +03:00
Michael Chus	d8ca0dca2c	Redesign scoring metrics: variance-based sustain scores, throttle stability PowerSustainScore: power draw variance (CV) during load, not deviation from TDP. ThermalSustainScore: temperature variance (CV) during load. StabilityScore: fraction of time spent in thermal+power-cap throttling. Remove NCCL bonus from quality_factor. quality = 0.35 + 0.35×Stability + 0.15×PowerSustain + 0.15×ThermalSustain, cap 1.00. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:39:59 +03:00
Michael Chus	d90250f80a	Fix DCGM cleanup and shorten memory validate	2026-04-16 00:39:37 +03:00
Michael Chus	8d6eaef5de	Update perf benchmark report methodology to reflect new design Remove references to pre-benchmark power calibration and dcgmi targeted_power. Document platform_power_score ramp-up methodology, PowerSustainScore fallback to steady-state power, and full-budget single-precision phases. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:31:58 +03:00
Michael Chus	732bf4cbab	Redesign power and performance benchmarks with new methodology Power/Thermal Fit: cumulative fixed-limit ramp where each GPU's stable TDP is found under real multi-GPU thermal load (all prior GPUs running at their fixed limits). PlatformMaxTDPW = sum of stable limits across all GPUs. Remove PlatformPowerScore from power test. Performance Benchmark: remove pre-benchmark power calibration entirely. After N single-card runs, execute k=2..N parallel ramp-up steps and compute PlatformPowerScore = mean compute scalability vs best single-card TOPS. PowerSustainScore falls back to Steady.AvgPowerW when calibration absent. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:30:50 +03:00
Michael Chus	fa6d905a10	Tune bee-gpu-burn single-precision benchmark phases	2026-04-16 00:05:47 +03:00
Mikhail Chusavitin	5c1862ce4c	Use lb clean --all to clear bootstrap cache on every build Prevents stale debootstrap cache from bypassing --debootstrap-options changes (e.g. --include=ca-certificates added in v8.15). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 17:37:08 +03:00
Mikhail Chusavitin	b65ef2ea1d	Fix: use --debootstrap-options to include ca-certificates in bootstrap --bootstrap-packages is not a valid lb config option (20230502). Use --debootstrap-options "--include=ca-certificates" instead to ensure ca-certificates is present when lb chroot_archives runs apt-get update against the NVIDIA CUDA HTTPS source. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 17:26:01 +03:00
Mikhail Chusavitin	533d703c97	Bootstrap ca-certificates so NVIDIA CUDA HTTPS source is trusted debootstrap creates a minimal chroot without ca-certificates, causing apt-get update to fail TLS verification for the NVIDIA CUDA apt source: "No system certificates available. Try installing ca-certificates." Add ca-certificates to --bootstrap-packages so it is present before lb chroot_archives configures the NVIDIA HTTPS source and runs apt-get update. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 17:24:20 +03:00
Mikhail Chusavitin	04eb4b5a6d	Revert "Pre-download DCGM/fabricmanager debs on host to bypass chroot apt" This reverts commit `4110dbf8a6`.	2026-04-15 17:19:53 +03:00
Mikhail Chusavitin	4110dbf8a6	Pre-download DCGM/fabricmanager debs on host to bypass chroot apt The NVIDIA CUDA HTTPS apt source (developer.download.nvidia.com) may be unreachable from inside the live-build container chroot, causing 'E: Unable to locate package datacenter-gpu-manager-4-cuda13'. Add build-dcgm.sh that downloads DCGM and nvidia-fabricmanager .deb packages on the build host (verifying SHA256 against Packages.gz) and caches them in BEE_CACHE_DIR. build.sh (step 25-dcgm, nvidia only) copies them into LB_DIR/config/packages.chroot/ before lb build, so live-build creates a local apt repo from them. The chroot installs the packages from the local repo without ever contacting the NVIDIA CUDA HTTPS source. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 17:10:23 +03:00
Mikhail Chusavitin	7237e4d3e4	Add fabric manager boot and support diagnostics	2026-04-15 16:14:26 +03:00
Mikhail Chusavitin	ab3ad77cd6	Fix Go module: upgrade modernc.org/libc v1.70.0 → v1.72.0 modernc.org/sqlite v1.48.0 requires modernc.org/libc/sys/types which is absent in v1.70.0 but present in v1.72.0. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 14:32:04 +03:00
Mikhail Chusavitin	cd9e2cbe13	Fix ramp-up power bench: one task instead of N redundant tasks RunNvidiaPowerBench already performs a full internal ramp from 1 to N GPUs in Phase 2. Spawning N tasks with growing GPU subsets meant task K repeated all steps 1..K-1 already done by tasks 1..K-1 — O(N²) work instead of O(N). Replace with a single task using all selected GPUs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 12:29:11 +03:00
Mikhail Chusavitin	0317dc58fd	Fix memtest hook: grub.cfg/live.cfg missing during binary hooks is expected lb binary_grub-efi and lb binary_syslinux create these files from templates that already have memtest entries hardcoded. The hook should not fail when the files don't exist yet — validate_iso_memtest() checks the final ISO. Only the binary files (x64.bin, x64.efi) are required here. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 10:33:22 +03:00
Mikhail Chusavitin	1c5cb45698	Fix memtest hook: bad ver_arg format in apt-get download ver_arg was set to "=memtest86+=VERSION" making the command "apt-get download memtest86+=memtest86+=VERSION" (invalid). Fixed to build pkg_spec directly as "memtest86+=VERSION". Also add apt-get update retry if initial download fails. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 10:15:01 +03:00
Mikhail Chusavitin	090b92ca73	Re-enable security repo: kernel 6.1.0-44 is in bookworm-security only Disabling --security broke the build because linux-image-6.1.0-44-amd64 is a security update not present in the base bookworm repo. Main packages already come from mirror.mephi.ru. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 10:02:52 +03:00
Mikhail Chusavitin	2dccbc010c	Use MEPHI mirror, disable security repo, fix memtest in ISO build - Switch all lb mirrors to mirror.mephi.ru/debian/ for faster/reliable downloads - Disable security repo (--security false) — not needed for LiveCD - Pin MEMTEST_VERSION=6.10-4 in VERSIONS, export to hook environment - Set BEE_REQUIRE_MEMTEST=1 in build-in-container.sh — missing memtest is now fatal - Fix 9100-memtest.hook.binary: add apt-get download fallback when lb binary_memtest has already purged the package cache; handle both 5.x (memtest86+x64.bin) and 6.x (memtest86+.bin) BIOS binary naming Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 09:57:29 +03:00
Michael Chus	e84c69d360	Fix optional step log dir missing after memtest recovery mkdir -p LOG_DIR before writing the optional step log so that a race with cleanup_build_log (EXIT trap archiving the log dir) does not cause a "Directory nonexistent" error during lb binary_checksums / lb binary_iso. Also downgrade apt-get update failure to a warning so a transient mirror outage does not block kernel ABI auto-detection when the apt cache is warm. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 07:28:36 +03:00
Michael Chus	c80a39e7ac	Add power results table, fix benchmark results refresh, bound memtester - Benchmark page now shows two result sections: Performance (scores) and Power / Thermal Fit (slot table). After any benchmark task completes the results section auto-refreshes via GET /api/benchmark/results without a full page reload. - Power results table shows each GPU slot with nominal TDP, achieved stable power limit, and P95 observed power. Rows with derated cards are highlighted amber so under-performing slots stand out at a glance. Older runs are collapsed in a <details> summary. - memtester is now wrapped with timeout(1) so a stuck memory controller cannot cause Validate Memory to hang indefinitely. Wall-clock limit is ~2.5 min per 100 MB per pass plus a 2-minute buffer. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 07:16:18 +03:00
Michael Chus	a5e0261ff2	Refactor power ramp to use true single-card baselines Phase 1 now calibrates each GPU individually (sequentially) so that PowerRealizationPct reflects real degradation from neighbour thermals and shared power rails. Previously the baseline came from an all-GPU-together run, making realization always ≈100% at the final ramp step. Ramp step 1 reuses single-card calibration results (no extra run); steps 2..N run targeted_power on the growing GPU subset with derating active. Remove OccupiedSlots/OccupiedSlotsNote fields and occupiedSlots() helper — they were compensation for the old all-GPU calibration approach. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 23:47:57 +03:00
Michael Chus	ee422ede3c	Revert "Add raster Easy Bee branding assets" This reverts commit `d560b2fead`.	2026-04-14 23:00:15 +03:00
Michael Chus	d560b2fead	Add raster Easy Bee branding assets	2026-04-14 22:39:25 +03:00
Michael Chus	3cf2e9c9dc	Run power calibration for all GPUs simultaneously Previously each GPU was calibrated sequentially (one card fully done before the next started), producing the staircase temperature pattern seen on the graph. Now all GPUs run together in a single dcgmi diag -r targeted_power session per attempt. This means: - All cards are under realistic thermal load at the same time. - A single DCGM session handles the run — no resource-busy contention from concurrent dcgmi processes. - Binary search state (lo/hi) is tracked independently per GPU; each card converges to its own highest stable power limit. - Throttle counter polling covers all active GPUs in the shared ticker. - Resource-busy exponential back-off is shared (one DCGM session). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 22:25:05 +03:00
Michael Chus	19dbabd71d	Simplify power calibration: pure binary search, no telemetry guessing Remove telemetry-guided initial candidate; use strict binary search midpoint at every step. Clean and predictable convergence in O(log N) attempts within the allowed power range [minLimitW, startingLimitW]. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 22:12:45 +03:00
Michael Chus	a6a07f2626	Replace linear power derate with binary search + telemetry-guided jump Power calibration previously stepped down 25 W at a time (linear), requiring up to 6 attempts to find a stable limit within 150 W range. New strategy: - Binary search between minLimitW (lo, assumed stable floor) and the starting/failed limit (hi, confirmed unstable), converging within a 10 W tolerance in ~4 attempts. - For thermal throttle: the first-quarter telemetry rows estimate the GPU's pre-throttle power draw. nextLimit = round5W(onset - 10 W) is used as the initial candidate instead of the binary midpoint, landing much closer to the true limit on the first step. - On success: lo is updated and a higher level is tried (binary search upward) until hi-lo ≤ tolerance, ensuring the highest stable limit is found rather than the first stable one. - Let targeted_power run to natural completion on throttle (no mid-run SIGKILL) so nv-hostengine releases its diagnostic slot cleanly before the next attempt. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 22:05:23 +03:00
Michael Chus	f87461ee4a	Detect thermal throttle with fans below 100% as cooling misconfiguration During power calibration: if a thermal throttle (sw_thermal/hw_thermal) causes ≥20% clock drop while server fans are below 98% P95 duty cycle, record a CoolingWarning on the GPU result and emit an actionable finding telling the operator to rerun with fans manually fixed at 100%. During steady-state benchmark: same signal enriches the existing thermal_limited finding with fan duty cycle and clock drift values. Covers both the main benchmark (buildBenchmarkFindings) and the power bench (NvidiaPowerBenchResult.Findings). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 21:44:57 +03:00
Michael Chus	a636146dbd	Fix power calibration failing due to DCGM resource contention When a targeted_power attempt is cancelled (e.g. after sw_thermal throttle), nv-hostengine holds the diagnostic slot asynchronously. The next attempt immediately received DCGM_ST_IN_USE (exit 222) and incorrectly derated the power limit. Now: exit 222 is detected via isDCGMResourceBusy and triggers an exponential back-off retry at the same power limit (1s, 2s, 4s, … up to 256s). Once the back-off delay would exceed 300s the calibration fails, indicating the slot is persistently held. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-14 20:41:17 +03:00
Mikhail Chusavitin	303de2df04	Add slot-aware ramp sequence to bee-bench power	2026-04-14 17:47:40 +03:00
Mikhail Chusavitin	95124d228f	Split bee-bench into perf and power workflows	2026-04-14 17:33:13 +03:00
Mikhail Chusavitin	54338dbae5	Unify live RAM runtime state	2026-04-14 16:18:33 +03:00
Mikhail Chusavitin	2be7ae6d28	Refine NVIDIA benchmark phase timing	2026-04-14 14:12:06 +03:00
Mikhail Chusavitin	b1a5035edd	Normalize task queue priorities by workflow	2026-04-14 11:13:54 +03:00
Mikhail Chusavitin	8fc986c933	Add benchmark fan duty cycle summary to report	2026-04-14 10:24:02 +03:00
Mikhail Chusavitin	88b5e0edf2	Harden IPMI power probe timeout	2026-04-14 10:18:23 +03:00
Mikhail Chusavitin	82fe1f6d26	Disable precision fallback and pin cuBLAS 13.1	2026-04-14 10:17:44 +03:00
Michael Chus	81e7c921f8	дебаг при сборке	2026-04-14 07:02:37 +03:00
Michael Chus	0fb8f2777f	Fix combined gpu burn profile capacity for fp4	2026-04-14 00:00:40 +03:00
Michael Chus	bf182daa89	Fix benchmark report methodology and rebuild gpu burn worker on toolchain changes	2026-04-13 23:43:12 +03:00
Michael Chus	457ea1cf04	Unify benchmark exports and drop ASCII charts	2026-04-13 21:38:28 +03:00
Michael Chus	bf6ecab4f0	Add per-precision benchmark phases, weighted TOPS scoring, and ECC tracking - Split steady window into 6 equal slots: fp8/fp16/fp32/fp64/fp4 + combined - Each precision phase runs bee-gpu-burn with --precision filter so PowerCVPct reflects single-kernel stability (not round-robin artifact) - Add fp4 support in bee-gpu-stress.c for Blackwell (cc>=100) via existing CUDA_R_4F_E2M1 guard - Weighted TOPS: fp64×2.0, fp32×1.0, fp16×0.5, fp8×0.25, fp4×0.125 - SyntheticScore = sum of weighted TOPS from per-precision phases - MixedScore = sum from combined phase; MixedEfficiency = Mixed/Synthetic - ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3) - ECC volatile counters sampled before/after each phase and overall - DegradationReasons: ecc_uncorrected_errors, ecc_corrected_errors - Report: per-precision stability table with ECC columns, methodology section - Ramp-up history table redesign: GPU indices as columns, runs as rows Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-13 10:49:49 +03:00